Lucas ARRIESSE committed on
Commit
7e24a88
·
1 Parent(s): 2dc9b4d

Add support for custom meeting URLs

Browse files
Files changed (4) hide show
  1. api/docs.py +82 -33
  2. schemas.py +12 -4
  3. static/index.html +21 -1
  4. static/js/app.js +29 -4
api/docs.py CHANGED
@@ -1,4 +1,5 @@
1
  import asyncio
 
2
  from aiolimiter import AsyncLimiter
3
  from pathlib import Path
4
  import traceback
@@ -17,6 +18,7 @@ import tempfile
17
  from lxml import etree
18
  from bs4 import BeautifulSoup
19
  from fastapi import Depends, File, HTTPException, UploadFile
 
20
  from dependencies import get_http_client, get_llm_router
21
  from fastapi.responses import StreamingResponse
22
  from litellm.router import Router
@@ -46,7 +48,8 @@ FORMAT_MIME_TYPES = {
46
  ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
47
  ".pdf": "application/pdf",
48
  ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
49
- ".zip": ""
 
50
  }
51
 
52
 
@@ -194,7 +197,7 @@ async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.B
194
  return (root, ext, io.BytesIO(doc_bytes))
195
 
196
  raise ValueError(
197
- "No file with a supported extension type was found in the archive file")
198
 
199
 
200
  def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
@@ -207,6 +210,7 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
207
 
208
  try:
209
  xml_bytes = docx_zip.read('word/document.xml')
 
210
  except KeyError:
211
  raise FileNotFoundError(
212
  "word/document.xml not found in the DOCX archive.")
@@ -256,6 +260,7 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
256
  new_zip.writestr('word/document.xml', xml_str)
257
 
258
  output.seek(0)
 
259
  return output
260
 
261
  # ============================================= Doc routes =========================================================
@@ -272,7 +277,7 @@ async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depen
272
  wg_number = re.search(r"\d", working_group).group(0)
273
 
274
  # building corresponding FTP url
275
- logging.debug(tsg, wg_number)
276
  url = "https://www.3gpp.org/ftp/tsg_" + tsg
277
  logging.debug(url)
278
 
@@ -309,44 +314,88 @@ async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depen
309
  @router.post("/get_meeting_docs", response_model=GetMeetingDocsResponse)
310
  async def get_meeting_docs(req: GetMeetingDocsRequest, http_client: AsyncClient = Depends(get_http_client)) -> GetMeetingDocsResponse:
311
  """
312
- Downloads the document list dataframe for a given meeting
 
313
  """
314
 
315
- # FIXME: extract the document URLS from the hyperlinks in the excelsheet using openpyxl?
 
316
 
317
- # Extracting WG
318
- working_group = req.working_group
319
- tsg = re.sub(r"\d+", "", working_group)
320
- wg_number = re.search(r"\d", working_group).group(0)
321
- url = "https://www.3gpp.org/ftp/tsg_" + tsg
322
- logging.info("Fetching TDocs dataframe")
323
 
324
- resp = await http_client.get(url)
325
- soup = BeautifulSoup(resp.text, "html.parser")
326
- wg_folders = [item.get_text() for item in soup.select("tr td a")]
327
- selected_folder = None
328
- for folder in wg_folders:
329
- if "wg" + str(wg_number) in folder.lower():
330
- selected_folder = folder
331
- break
 
 
 
 
 
332
 
333
- url += "/" + selected_folder + "/" + req.meeting + "/docs"
334
- resp = await http_client.get(url)
335
- soup = BeautifulSoup(resp.text, "html.parser")
336
- files = [item.get_text() for item in soup.select("tr td a")
337
- if item.get_text().endswith(".xlsx")]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
 
339
- if files == []:
340
- raise HTTPException(status_code=404, detail="No XLSX has been found")
 
341
 
342
- df = pd.read_excel(str(url + "/" + files[0]).replace("#", "%23"))
343
- filtered_df = df[~(
344
- df["Uploaded"].isna())][["TDoc", "Title", "CR category", "For", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
345
- filtered_df["URL"] = filtered_df["TDoc"].apply(
346
- lambda tdoc: f"{url}/{tdoc}.zip")
347
 
348
- df = filtered_df.fillna("")
349
- return GetMeetingDocsResponse(data=df[["TDoc", "Title", "Type", "For", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
350
 
351
  # ==================================================================================================================================
352
 
 
1
  import asyncio
2
+ from urllib.parse import urlparse
3
  from aiolimiter import AsyncLimiter
4
  from pathlib import Path
5
  import traceback
 
18
  from lxml import etree
19
  from bs4 import BeautifulSoup
20
  from fastapi import Depends, File, HTTPException, UploadFile
21
+ import urllib
22
  from dependencies import get_http_client, get_llm_router
23
  from fastapi.responses import StreamingResponse
24
  from litellm.router import Router
 
48
  ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
49
  ".pdf": "application/pdf",
50
  ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
51
+ ".doc": "application/msword",
52
+ ".ppt": "application/vnd.ms-powerpoint"
53
  }
54
 
55
 
 
197
  return (root, ext, io.BytesIO(doc_bytes))
198
 
199
  raise ValueError(
200
+ f"No file with a supported extension type was found in the archive file: {ext}")
201
 
202
 
203
  def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
 
210
 
211
  try:
212
  xml_bytes = docx_zip.read('word/document.xml')
213
+ logging.debug("Read the document XML")
214
  except KeyError:
215
  raise FileNotFoundError(
216
  "word/document.xml not found in the DOCX archive.")
 
260
  new_zip.writestr('word/document.xml', xml_str)
261
 
262
  output.seek(0)
263
+ logging.debug("Exporting new docx revision OK")
264
  return output
265
 
266
  # ============================================= Doc routes =========================================================
 
277
  wg_number = re.search(r"\d", working_group).group(0)
278
 
279
  # building corresponding FTP url
280
+ logging.debug(f"FTP internal working group ID is {tsg}{wg_number}")
281
  url = "https://www.3gpp.org/ftp/tsg_" + tsg
282
  logging.debug(url)
283
 
 
314
  @router.post("/get_meeting_docs", response_model=GetMeetingDocsResponse)
315
  async def get_meeting_docs(req: GetMeetingDocsRequest, http_client: AsyncClient = Depends(get_http_client)) -> GetMeetingDocsResponse:
316
  """
317
+ Downloads the document list dataframe for a given meeting OR alternatively returns the document list dataframe from the given FTP URL.
318
+ If `custom_url` field is set in the request schema, the other fields are ignored.
319
  """
320
 
321
+ if req.custom_url:
322
+ logging.info(f"Fetching TDocs at custom URL {req.custom_url}")
323
 
324
+ # Only allow 3GPP FTP URLs
325
+ if '3gpp.org' not in req.custom_url:
326
+ raise HTTPException(status_code=401)
 
 
 
327
 
328
+ # 3GPP FTP listing is an ASP.NET app which tags available file links with a .file CSS class
329
+ reponse = await http_client.get(req.custom_url)
330
+ soup = BeautifulSoup(reponse.text, "html.parser")
331
+
332
+ # retrieve all file links from HTML response
333
+ file_links = [l.get('href')
334
+ for l in soup.select('table > tbody > tr a.file')]
335
+
336
+ # extract all the file names (without extensions)
337
+ file_names = [
338
+ urllib.parse.unquote(Path(urllib.parse.urlparse(url).path).stem)
339
+ for url in file_links
340
+ ]
341
 
342
+ # create doc list
343
+ df = pd.DataFrame()
344
+ df["TDoc"] = file_names
345
+ df["URL"] = file_links
346
+
347
+ # build document list dataframe
348
+ DF_COL_TYPES = ["TDoc", "Title", "Type", "For",
349
+ "TDoc Status", "Agenda item description", "URL"]
350
+
351
+ for tp in DF_COL_TYPES:
352
+ df[tp] = "Unknown"
353
+
354
+ df["TDoc"] = file_names
355
+ df["URL"] = file_links
356
+ df["Type"] = "TDoc / xxxxCR"
357
+
358
+ return GetMeetingDocsResponse(data=df.to_dict(orient="records"))
359
+
360
+ else:
361
+ # Walk the FTP directory listing to find the excel sheet which contains all the available documents for the meeting
362
+
363
+ # Extracting WG
364
+ working_group = req.working_group
365
+ tsg = re.sub(r"\d+", "", working_group)
366
+ wg_number = re.search(r"\d", working_group).group(0)
367
+ url = "https://www.3gpp.org/ftp/tsg_" + tsg
368
+
369
+ logging.info(
370
+ f"Fetching TDocs dataframe for {working_group}:{req.meeting}")
371
+
372
+ resp = await http_client.get(url)
373
+ soup = BeautifulSoup(resp.text, "html.parser")
374
+ wg_folders = [item.get_text() for item in soup.select("tr td a")]
375
+ selected_folder = None
376
+ for folder in wg_folders:
377
+ if "wg" + str(wg_number) in folder.lower():
378
+ selected_folder = folder
379
+ break
380
+
381
+ url += "/" + selected_folder + "/" + req.meeting + "/docs"
382
+ resp = await http_client.get(url)
383
+ soup = BeautifulSoup(resp.text, "html.parser")
384
+ files = [item.get_text() for item in soup.select("tr td a")
385
+ if item.get_text().endswith(".xlsx")]
386
 
387
+ if files == []:
388
+ raise HTTPException(
389
+ status_code=404, detail="No Excel file has been found")
390
 
391
+ df = pd.read_excel(str(url + "/" + files[0]).replace("#", "%23"))
392
+ filtered_df = df[~(
393
+ df["Uploaded"].isna())][["TDoc", "Title", "CR category", "For", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
394
+ filtered_df["URL"] = filtered_df["TDoc"].apply(
395
+ lambda tdoc: f"{url}/{tdoc}.zip")
396
 
397
+ df = filtered_df.fillna("")
398
+ return GetMeetingDocsResponse(data=df[["TDoc", "Title", "Type", "For", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
399
 
400
  # ==================================================================================================================================
401
 
schemas.py CHANGED
@@ -14,9 +14,15 @@ class GetMeetingsResponse(BaseModel):
14
  ..., description="Mapping of meetings with as key their display name and value the FTP meeting name.")
15
 
16
 
 
17
  class GetMeetingDocsRequest(BaseModel):
18
- working_group: str
19
- meeting: str
 
 
 
 
 
20
 
21
 
22
  class GetMeetingDocsResponse(BaseModel):
@@ -40,7 +46,8 @@ class DocInfo(BaseModel):
40
  class DownloadDocsRequest(BaseModel):
41
  documents: List[DocInfo] = Field(
42
  description="List of documents to download")
43
- sort_by_agenda_item: bool = Field(default=False, description="Whether to sort the files by their agenda item.")
 
44
 
45
  # --------------------------------------
46
 
@@ -119,7 +126,8 @@ class ReqGroupingRequest(BaseModel):
119
  requirements: list[RequirementInfo]
120
  max_n_categories: Optional[int] = Field(
121
  default=None, description="Max number of categories to construct. Defaults to None")
122
- disable_sort_checks: bool = Field(default=False, description="Disable sort checks when grouping requirements")
 
123
 
124
 
125
  class ReqGroupingResponse(BaseModel):
 
14
  ..., description="Mapping of meetings with as key their display name and value the FTP meeting name.")
15
 
16
 
17
+ # FIXME: make WG and meeting optional
18
  class GetMeetingDocsRequest(BaseModel):
19
+ working_group: Literal["SA1", "SA2", "SA3", "SA4", "SA5", "SA6",
20
+ "CT1", "CT2", "CT3", "CT4", "CT5", "CT6", "RAN1", "RAN2", "CUSTOM_URL"]
21
+ meeting: str = Field(...,
22
+ description="The ID of the corresponding meeting")
23
+
24
+ custom_url: Optional[str] = Field(
25
+ ..., description="A user provided URL of the 3GPP FTP to scrap for documents. If this field is set, the content of others fields is ignored")
26
 
27
 
28
  class GetMeetingDocsResponse(BaseModel):
 
46
  class DownloadDocsRequest(BaseModel):
47
  documents: List[DocInfo] = Field(
48
  description="List of documents to download")
49
+ sort_by_agenda_item: bool = Field(
50
+ default=False, description="Whether to sort the files by their agenda item.")
51
 
52
  # --------------------------------------
53
 
 
126
  requirements: list[RequirementInfo]
127
  max_n_categories: Optional[int] = Field(
128
  default=None, description="Max number of categories to construct. Defaults to None")
129
+ disable_sort_checks: bool = Field(
130
+ default=False, description="Disable sort checks when grouping requirements")
131
 
132
 
133
  class ReqGroupingResponse(BaseModel):
static/index.html CHANGED
@@ -35,7 +35,6 @@
35
  <label for="working-group-select" class="block text-sm font-medium text-gray-700 mb-2">Working
36
  Group</label>
37
  <select id="working-group-select" class="w-full p-2 border border-gray-300 rounded-md">
38
- <option value="" selected>Select a working group</option>
39
  <option value="SA1">SA1</option>
40
  <option value="SA2">SA2</option>
41
  <option value="SA3">SA3</option>
@@ -50,6 +49,8 @@
50
  <option value="CT6">CT6</option>
51
  <option value="RAN1">RAN1</option>
52
  <option value="RAN2">RAN2</option>
 
 
53
  </select>
54
  </div>
55
 
@@ -67,6 +68,25 @@
67
  </div>
68
  </div>
69
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  </div>
71
 
72
  <hr>
 
35
  <label for="working-group-select" class="block text-sm font-medium text-gray-700 mb-2">Working
36
  Group</label>
37
  <select id="working-group-select" class="w-full p-2 border border-gray-300 rounded-md">
 
38
  <option value="SA1">SA1</option>
39
  <option value="SA2">SA2</option>
40
  <option value="SA3">SA3</option>
 
49
  <option value="CT6">CT6</option>
50
  <option value="RAN1">RAN1</option>
51
  <option value="RAN2">RAN2</option>
52
+ <option value="CUSTOM_URL">Custom URL</option>
53
+ <option value="" selected>Select a working group</option>
54
  </select>
55
  </div>
56
 
 
68
  </div>
69
  </div>
70
  </div>
71
+
72
+ <!--Custom URL container-->
73
+ <div class="mt-4 hidden" id="meeting-custom-url-container">
74
+ <label for="meeting-custom-url" class="block text-sm font-medium text-gray-700 mb-2">Direct URL to
75
+ meeting files</label>
76
+ <label class="input validator w-full">
77
+ <svg class="h-[1em] opacity-50" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
78
+ <g stroke-linejoin="round" stroke-linecap="round" stroke-width="2.5" fill="none"
79
+ stroke="currentColor">
80
+ <path d="M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"></path>
81
+ <path d="M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"></path>
82
+ </g>
83
+ </svg>
84
+ <input id="meeting-custom-url" name="meeting-custom-url" type="url" required placeholder="https://"
85
+ value="https://" pattern="^(https?://)?([a-zA-Z0-9]([a-zA-Z0-9\-].*[a-zA-Z0-9])?\.)+[a-zA-Z].*$"
86
+ title="Must be valid URL" />
87
+ </label>
88
+ <p class="validator-hint">Must be valid URL</p>
89
+ </div>
90
  </div>
91
 
92
  <hr>
static/js/app.js CHANGED
@@ -39,6 +39,11 @@ async function getMeetings() {
39
  const workingGroup = document.getElementById('working-group-select').value;
40
  if (!workingGroup) return;
41
 
 
 
 
 
 
42
  showLoadingOverlay('Getting all available meetings...');
43
  toggleElementsEnabled(['get-meetings-btn'], false);
44
 
@@ -66,18 +71,32 @@ async function getMeetings() {
66
  */
67
  async function getTDocs() {
68
  const workingGroup = document.getElementById('working-group-select').value;
 
69
  const meeting = document.getElementById('meeting-select').value;
70
 
71
- if (!workingGroup || !meeting) return;
 
 
 
 
 
 
 
 
 
 
72
 
73
  showLoadingOverlay('Getting TDocs List...');
74
  toggleElementsEnabled(['get-tdocs-btn'], false);
75
 
76
  try {
 
 
 
77
  const response = await fetch('/docs/get_meeting_docs', {
78
  method: 'POST',
79
  headers: { 'Content-Type': 'application/json' },
80
- body: JSON.stringify({ working_group: workingGroup, meeting: meeting })
81
  });
82
 
83
  const data = await response.json();
@@ -1064,8 +1083,14 @@ document.addEventListener('DOMContentLoaded', function () {
1064
  bindTabs();
1065
 
1066
  // Événements des boutons principaux
1067
- document.getElementById('working-group-select').addEventListener('change', (ev) => {
1068
- getMeetings();
 
 
 
 
 
 
1069
  });
1070
 
1071
  document.getElementById('get-tdocs-btn').addEventListener('click', getTDocs);
 
39
  const workingGroup = document.getElementById('working-group-select').value;
40
  if (!workingGroup) return;
41
 
42
+ // early return if CUSTOM_URL option is selected
43
+ // the custom URL is used in the getTdocs()
44
+ if (workingGroup == "CUSTOM_URL")
45
+ return;
46
+
47
  showLoadingOverlay('Getting all available meetings...');
48
  toggleElementsEnabled(['get-meetings-btn'], false);
49
 
 
71
  */
72
  async function getTDocs() {
73
  const workingGroup = document.getElementById('working-group-select').value;
74
+ const customMeetingURL = document.getElementById('meeting-custom-url').value;
75
  const meeting = document.getElementById('meeting-select').value;
76
 
77
+ if (!workingGroup)
78
+ return;
79
+
80
+ // Handle case if custom meeting url is enabled but not put in
81
+ if (workingGroup === "CUSTOM_URL" && !customMeetingURL)
82
+ return;
83
+
84
+ // handle the other case where the working group is a known one with known meeting list
85
+ if (workingGroup !== "CUSTOM_URL" && !meeting) {
86
+ return;
87
+ }
88
 
89
  showLoadingOverlay('Getting TDocs List...');
90
  toggleElementsEnabled(['get-tdocs-btn'], false);
91
 
92
  try {
93
+ const custom_url = workingGroup === "CUSTOM_URL" ? customMeetingURL : null;
94
+ const meetingName = workingGroup === "CUSTOM_URL" ? "NO_MEETING" : meeting;
95
+
96
  const response = await fetch('/docs/get_meeting_docs', {
97
  method: 'POST',
98
  headers: { 'Content-Type': 'application/json' },
99
+ body: JSON.stringify({ working_group: workingGroup, meeting: meetingName, custom_url: custom_url })
100
  });
101
 
102
  const data = await response.json();
 
1083
  bindTabs();
1084
 
1085
  // Événements des boutons principaux
1086
+ document.getElementById('working-group-select').addEventListener('change', getMeetings);
1087
+ document.getElementById('working-group-select').addEventListener('change', e => {
1088
+ // show the custom meeting URL for CUSTOM_URL working group selection
1089
+ const custom_url_div = document.getElementById('meeting-custom-url-container');
1090
+ if (e.target.value == "CUSTOM_URL")
1091
+ custom_url_div.classList.remove("hidden");
1092
+ else
1093
+ custom_url_div.classList.add("hidden");
1094
  });
1095
 
1096
  document.getElementById('get-tdocs-btn').addEventListener('click', getTDocs);