Spaces:

OrganizedProgrammers
/

Docxtract

Building

App Files Files Community

Lucas ARRIESSE committed about 19 hours ago

Commit

7e24a88

1 Parent(s): 2dc9b4d

Add support for custom meeting URLs

Browse files

Files changed (4) hide show

api/docs.py +82 -33
schemas.py +12 -4
static/index.html +21 -1
static/js/app.js +29 -4

api/docs.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import asyncio
 from aiolimiter import AsyncLimiter
 from pathlib import Path
 import traceback
@@ -17,6 +18,7 @@ import tempfile
 from lxml import etree
 from bs4 import BeautifulSoup
 from fastapi import Depends, File, HTTPException, UploadFile
 from dependencies import get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
 from litellm.router import Router
@@ -46,7 +48,8 @@ FORMAT_MIME_TYPES = {
     ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
     ".pdf": "application/pdf",
     ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-    ".zip": ""
 }
@@ -194,7 +197,7 @@ async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.B
                     return (root, ext, io.BytesIO(doc_bytes))
             raise ValueError(
-                "No file with a supported extension type was found in the archive file")
 def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
@@ -207,6 +210,7 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
     try:
         xml_bytes = docx_zip.read('word/document.xml')
     except KeyError:
         raise FileNotFoundError(
             "word/document.xml not found in the DOCX archive.")
@@ -256,6 +260,7 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
         new_zip.writestr('word/document.xml', xml_str)
     output.seek(0)
     return output
 # ============================================= Doc routes =========================================================
@@ -272,7 +277,7 @@ async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depen
     wg_number = re.search(r"\d", working_group).group(0)
     # building corresponding FTP url
-    logging.debug(tsg, wg_number)
     url = "https://www.3gpp.org/ftp/tsg_" + tsg
     logging.debug(url)
@@ -309,44 +314,88 @@ async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depen
 @router.post("/get_meeting_docs", response_model=GetMeetingDocsResponse)
 async def get_meeting_docs(req: GetMeetingDocsRequest, http_client: AsyncClient = Depends(get_http_client)) -> GetMeetingDocsResponse:
     """
-    Downloads the document list dataframe for a given meeting
     """
-    # FIXME: extract the document URLS from the hyperlinks in the excelsheet using openpyxl?
-    # Extracting WG
-    working_group = req.working_group
-    tsg = re.sub(r"\d+", "", working_group)
-    wg_number = re.search(r"\d", working_group).group(0)
-    url = "https://www.3gpp.org/ftp/tsg_" + tsg
-    logging.info("Fetching TDocs dataframe")
-    resp = await http_client.get(url)
-    soup = BeautifulSoup(resp.text, "html.parser")
-    wg_folders = [item.get_text() for item in soup.select("tr td a")]
-    selected_folder = None
-    for folder in wg_folders:
-        if "wg" + str(wg_number) in folder.lower():
-            selected_folder = folder
-            break
-    url += "/" + selected_folder + "/" + req.meeting + "/docs"
-    resp = await http_client.get(url)
-    soup = BeautifulSoup(resp.text, "html.parser")
-    files = [item.get_text() for item in soup.select("tr td a")
-             if item.get_text().endswith(".xlsx")]
-    if files == []:
-        raise HTTPException(status_code=404, detail="No XLSX has been found")
-    df = pd.read_excel(str(url + "/" + files[0]).replace("#", "%23"))
-    filtered_df = df[~(
-        df["Uploaded"].isna())][["TDoc", "Title", "CR category", "For", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
-    filtered_df["URL"] = filtered_df["TDoc"].apply(
-        lambda tdoc: f"{url}/{tdoc}.zip")
-    df = filtered_df.fillna("")
-    return GetMeetingDocsResponse(data=df[["TDoc", "Title", "Type", "For", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
 # ==================================================================================================================================

 import asyncio
+from urllib.parse import urlparse
 from aiolimiter import AsyncLimiter
 from pathlib import Path
 import traceback
 from lxml import etree
 from bs4 import BeautifulSoup
 from fastapi import Depends, File, HTTPException, UploadFile
+import urllib
 from dependencies import get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
 from litellm.router import Router
     ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
     ".pdf": "application/pdf",
     ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    ".doc": "application/msword",
+    ".ppt": "application/vnd.ms-powerpoint"
 }
                     return (root, ext, io.BytesIO(doc_bytes))
             raise ValueError(
+                f"No file with a supported extension type was found in the archive file: {ext}")
 def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
     try:
         xml_bytes = docx_zip.read('word/document.xml')
+        logging.debug("Read the document XML")
     except KeyError:
         raise FileNotFoundError(
             "word/document.xml not found in the DOCX archive.")
         new_zip.writestr('word/document.xml', xml_str)
     output.seek(0)
+    logging.debug("Exporting new docx revision OK")
     return output
 # ============================================= Doc routes =========================================================
     wg_number = re.search(r"\d", working_group).group(0)
     # building corresponding FTP url
+    logging.debug(f"FTP internal working group ID is {tsg}{wg_number}")
     url = "https://www.3gpp.org/ftp/tsg_" + tsg
     logging.debug(url)
 def _is_3gpp_url(url: str) -> bool:
     """Tell whether `url` is an http(s) URL whose host is 3gpp.org or one of its subdomains."""
     try:
         parsed = urlparse(url)
         host = (parsed.hostname or "").lower()
     except ValueError:
         # urlparse rejects some malformed authorities (e.g. unbalanced IPv6 brackets)
         return False
     return parsed.scheme in ("http", "https") and (
         host == "3gpp.org" or host.endswith(".3gpp.org"))
 @router.post("/get_meeting_docs", response_model=GetMeetingDocsResponse)
 async def get_meeting_docs(req: GetMeetingDocsRequest, http_client: AsyncClient = Depends(get_http_client)) -> GetMeetingDocsResponse:
     """
     Downloads the document list for a given meeting, or builds it from the file listing found at a user provided 3GPP FTP URL.
     If the `custom_url` field is set in the request, the other fields are ignored.

     Responds with HTTP 400 when the custom URL is not hosted on 3gpp.org or when no working group number
     can be extracted from the request, and with HTTP 404 when the working group folder or the meeting
     Excel sheet cannot be found in the FTP listing.
     """
     if req.custom_url:
         logging.info(f"Fetching TDocs at custom URL {req.custom_url}")
         # Only allow 3GPP FTP URLs. The parsed host is checked rather than the raw string:
         # a substring test would also accept e.g. "https://evil.example/?3gpp.org".
         if not _is_3gpp_url(req.custom_url):
             raise HTTPException(
                 status_code=400, detail="Only 3GPP FTP URLs are allowed")
         # 3GPP FTP listing is an ASP.NET app which tags available file links with a .file CSS class
         response = await http_client.get(req.custom_url)
         soup = BeautifulSoup(response.text, "html.parser")
         # retrieve all file links from the HTML response; anchors without a href are skipped
         # and relative links are resolved against the listing URL so they stay downloadable
         file_links = [
             urllib.parse.urljoin(req.custom_url, link.get('href'))
             for link in soup.select('table > tbody > tr a.file')
             if link.get('href')
         ]
         # build the document list: only the name and URL are known, every other column is a placeholder
         records = [
             {
                 # the file name (without extension) stands in for the TDoc identifier
                 "TDoc": urllib.parse.unquote(Path(urlparse(link).path).stem),
                 "Title": "Unknown",
                 "Type": "TDoc / xxxxCR",
                 "For": "Unknown",
                 "TDoc Status": "Unknown",
                 "Agenda item description": "Unknown",
                 "URL": link,
             }
             for link in file_links
         ]
         return GetMeetingDocsResponse(data=records)
     # Walk the FTP directory listing to find the excel sheet which contains all the available documents for the meeting
     # Extracting WG
     working_group = req.working_group
     tsg = re.sub(r"\d+", "", working_group)
     wg_match = re.search(r"\d", working_group)
     if wg_match is None:
         # happens when the "CUSTOM_URL" working group is sent without an actual custom URL
         raise HTTPException(
             status_code=400, detail=f"No working group number found in '{working_group}'")
     wg_number = wg_match.group(0)
     url = "https://www.3gpp.org/ftp/tsg_" + tsg
     logging.info(
         f"Fetching TDocs dataframe for {working_group}:{req.meeting}")
     resp = await http_client.get(url)
     soup = BeautifulSoup(resp.text, "html.parser")
     wg_folders = [item.get_text() for item in soup.select("tr td a")]
     # first folder of the TSG listing whose name mentions the working group number
     selected_folder = next(
         (folder for folder in wg_folders if "wg" + wg_number in folder.lower()), None)
     if selected_folder is None:
         raise HTTPException(
             status_code=404, detail=f"No FTP folder has been found for {working_group}")
     url += "/" + selected_folder + "/" + req.meeting + "/docs"
     resp = await http_client.get(url)
     soup = BeautifulSoup(resp.text, "html.parser")
     files = [item.get_text() for item in soup.select("tr td a")
              if item.get_text().endswith(".xlsx")]
     if not files:
         raise HTTPException(
             status_code=404, detail="No Excel file has been found")
     # '#' must be percent-encoded, otherwise the rest of the file name is treated as a URL fragment
     xlsx_url = str(url + "/" + files[0]).replace("#", "%23")
     # pd.read_excel downloads and parses synchronously; run it in a worker thread to keep the event loop free
     df = await asyncio.to_thread(pd.read_excel, xlsx_url)
     # keep only the documents which were actually uploaded; copy so that adding a column does not touch a view of df
     filtered_df = df[df["Uploaded"].notna()][["TDoc", "Title", "CR category", "For", "Source",
                                               "Type", "Agenda item", "Agenda item description", "TDoc Status"]].copy()
     filtered_df["URL"] = filtered_df["TDoc"].apply(
         lambda tdoc: f"{url}/{tdoc}.zip")
     df = filtered_df.fillna("")
     return GetMeetingDocsResponse(data=df[["TDoc", "Title", "Type", "For", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
 # ==================================================================================================================================

schemas.py CHANGED Viewed

@@ -14,9 +14,15 @@ class GetMeetingsResponse(BaseModel):
         ..., description="Mapping of meetings with as key their display name and value the FTP meeting name.")
 class GetMeetingDocsRequest(BaseModel):
-    working_group: str
-    meeting: str
 class GetMeetingDocsResponse(BaseModel):
@@ -40,7 +46,8 @@ class DocInfo(BaseModel):
 class DownloadDocsRequest(BaseModel):
     documents: List[DocInfo] = Field(
         description="List of documents to download")
-    sort_by_agenda_item: bool = Field(default=False, description="Whether to sort the files by their agenda item.")
 # --------------------------------------
@@ -119,7 +126,8 @@ class ReqGroupingRequest(BaseModel):
     requirements: list[RequirementInfo]
     max_n_categories: Optional[int] = Field(
         default=None, description="Max number of categories to construct. Defaults to None")
-    disable_sort_checks: bool = Field(default=False, description="Disable sort checks when grouping requirements")
 class ReqGroupingResponse(BaseModel):

         ..., description="Mapping of meetings with as key their display name and value the FTP meeting name.")
+# FIXME: make WG and meeting optional
 class GetMeetingDocsRequest(BaseModel):
     """Request body of the `/get_meeting_docs` route."""
     # "CUSTOM_URL" is the pseudo working group sent by the UI when a custom URL is used
     working_group: Literal["SA1", "SA2", "SA3", "SA4", "SA5", "SA6",
                            "CT1", "CT2", "CT3", "CT4", "CT5", "CT6", "RAN1", "RAN2", "CUSTOM_URL"]
     meeting: str = Field(...,
                          description="The ID of the corresponding meeting")
     # default=None (instead of `...`) so the field can be omitted entirely:
     # with `...` the field is required even though its type is Optional
     custom_url: Optional[str] = Field(
         default=None, description="A user provided URL of the 3GPP FTP to scrap for documents. If this field is set, the content of others fields is ignored")
 class GetMeetingDocsResponse(BaseModel):
 class DownloadDocsRequest(BaseModel):
     documents: List[DocInfo] = Field(
         description="List of documents to download")
+    sort_by_agenda_item: bool = Field(
+        default=False, description="Whether to sort the files by their agenda item.")
 # --------------------------------------
     requirements: list[RequirementInfo]
     max_n_categories: Optional[int] = Field(
         default=None, description="Max number of categories to construct. Defaults to None")
+    disable_sort_checks: bool = Field(
+        default=False, description="Disable sort checks when grouping requirements")
 class ReqGroupingResponse(BaseModel):

static/index.html CHANGED Viewed

@@ -35,7 +35,6 @@
                     <label for="working-group-select" class="block text-sm font-medium text-gray-700 mb-2">Working
                         Group</label>
                     <select id="working-group-select" class="w-full p-2 border border-gray-300 rounded-md">
-                        <option value="" selected>Select a working group</option>
                         <option value="SA1">SA1</option>
                         <option value="SA2">SA2</option>
                         <option value="SA3">SA3</option>
@@ -50,6 +49,8 @@
                         <option value="CT6">CT6</option>
                         <option value="RAN1">RAN1</option>
                         <option value="RAN2">RAN2</option>
                     </select>
                 </div>
@@ -67,6 +68,25 @@
                     </div>
                 </div>
             </div>
         </div>
         <hr>

                     <label for="working-group-select" class="block text-sm font-medium text-gray-700 mb-2">Working
                         Group</label>
                     <select id="working-group-select" class="w-full p-2 border border-gray-300 rounded-md">
                         <option value="SA1">SA1</option>
                         <option value="SA2">SA2</option>
                         <option value="SA3">SA3</option>
                         <option value="CT6">CT6</option>
                         <option value="RAN1">RAN1</option>
                         <option value="RAN2">RAN2</option>
+                        <option value="CUSTOM_URL">Custom URL</option>
+                        <option value="" selected>Select a working group</option>
                     </select>
                 </div>
                     </div>
                 </div>
             </div>
+            <!--Custom URL container-->
+            <div class="mt-4 hidden" id="meeting-custom-url-container">
+                <label for="meeting-custom-url" class="block text-sm font-medium text-gray-700 mb-2">Direct URL to
+                    meeting files</label>
+                <label class="input validator w-full">
+                    <svg class="h-[1em] opacity-50" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+                        <g stroke-linejoin="round" stroke-linecap="round" stroke-width="2.5" fill="none"
+                            stroke="currentColor">
+                            <path d="M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"></path>
+                            <path d="M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"></path>
+                        </g>
+                    </svg>
+                    <input id="meeting-custom-url" name="meeting-custom-url" type="url" required placeholder="https://"
+                        value="https://" pattern="^(https?://)?([a-zA-Z0-9]([a-zA-Z0-9\-].*[a-zA-Z0-9])?\.)+[a-zA-Z].*$"
+                        title="Must be valid URL" />
+                </label>
+                <p class="validator-hint">Must be valid URL</p>
+            </div>
         </div>
         <hr>

static/js/app.js CHANGED Viewed

@@ -39,6 +39,11 @@ async function getMeetings() {
     const workingGroup = document.getElementById('working-group-select').value;
     if (!workingGroup) return;
     showLoadingOverlay('Getting all available meetings...');
     toggleElementsEnabled(['get-meetings-btn'], false);
@@ -66,18 +71,32 @@ async function getMeetings() {
  */
 async function getTDocs() {
     const workingGroup = document.getElementById('working-group-select').value;
     const meeting = document.getElementById('meeting-select').value;
-    if (!workingGroup || !meeting) return;
     showLoadingOverlay('Getting TDocs List...');
     toggleElementsEnabled(['get-tdocs-btn'], false);
     try {
         const response = await fetch('/docs/get_meeting_docs', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
-            body: JSON.stringify({ working_group: workingGroup, meeting: meeting })
         });
         const data = await response.json();
@@ -1064,8 +1083,14 @@ document.addEventListener('DOMContentLoaded', function () {
     bindTabs();
     // Événements des boutons principaux
-    document.getElementById('working-group-select').addEventListener('change', (ev) => {
-        getMeetings();
     });
     document.getElementById('get-tdocs-btn').addEventListener('click', getTDocs);

     const workingGroup = document.getElementById('working-group-select').value;
     if (!workingGroup) return;
+    // early return if CUSTOM_URL option is selected
+    // the custom URL is used in the getTdocs()
+    if (workingGroup == "CUSTOM_URL")
+        return;
     showLoadingOverlay('Getting all available meetings...');
     toggleElementsEnabled(['get-meetings-btn'], false);
  */
 async function getTDocs() {
     const workingGroup = document.getElementById('working-group-select').value;
+    const customMeetingURL = document.getElementById('meeting-custom-url').value;
     const meeting = document.getElementById('meeting-select').value;
+    if (!workingGroup)
+        return;
+    // Handle case if custom meeting url is enabled but not put in
+    if (workingGroup === "CUSTOM_URL" && !customMeetingURL)
+        return;
+    // handle the other case where the working group is a known one with known meeting list
+    if (workingGroup !== "CUSTOM_URL" && !meeting) {
+        return;
+    }
     showLoadingOverlay('Getting TDocs List...');
     toggleElementsEnabled(['get-tdocs-btn'], false);
     try {
+        const custom_url = workingGroup === "CUSTOM_URL" ? customMeetingURL : null;
+        const meetingName = workingGroup === "CUSTOM_URL" ? "NO_MEETING" : meeting;
         const response = await fetch('/docs/get_meeting_docs', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ working_group: workingGroup, meeting: meetingName, custom_url: custom_url })
         });
         const data = await response.json();
     bindTabs();
     // Événements des boutons principaux
+    document.getElementById('working-group-select').addEventListener('change', getMeetings);
+    document.getElementById('working-group-select').addEventListener('change', e => {
+        // show the custom meeting URL for CUSTOM_URL working group selection
+        const custom_url_div = document.getElementById('meeting-custom-url-container');
+        if (e.target.value == "CUSTOM_URL")
+            custom_url_div.classList.remove("hidden");
+        else
+            custom_url_div.classList.add("hidden");
     });
     document.getElementById('get-tdocs-btn').addEventListener('click', getTDocs);