Spaces:

adriiita
/

TeachingAssistant

Sleeping

App Files Community

adriiita committed on Nov 20, 2024

Commit

c17054a

verified ·

1 Parent(s): 268ef51

Update processors/input_processor.py

Browse files

Files changed (1) hide show

processors/input_processor.py +40 -18

processors/input_processor.py CHANGED Viewed

@@ -13,6 +13,11 @@ from youtube_transcript_api import (
     NoTranscriptAvailable
 )
 import re
 class ContentProcessor:
     def __init__(self):
@@ -32,29 +37,46 @@ class ContentProcessor:
         return pages
     def process_youtube(self, video_url):
-        video_id = self._extract_video_id(video_url)
-        if not video_id:
-            raise ValueError("This appears to be an invalid YouTube URL. Please check the URL and try again.")
         try:
-            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
-            full_transcript = " ".join([entry['text'] for entry in transcript_list])
-            # Create a document-like structure
-            from langchain.schema import Document
-            doc = Document(
-                page_content=full_transcript,
-                metadata={"source": video_url}
-            )
-            return self.text_splitter.split_documents([doc])
-        except TranscriptsDisabled:
-            raise Exception("This video does not have subtitles/captions enabled. Please try a different video that has captions available.")
-        except NoTranscriptFound:
-            raise Exception("No transcript was found for this video. Please try a different video that has captions available.")
         except Exception as e:
-            raise Exception(f"Unable to get transcript: {str(e)}. Please ensure the video has captions enabled.")
     def _extract_video_id(self, url):
         # Handle different YouTube URL formats

     NoTranscriptAvailable
 )
 import re
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 class ContentProcessor:
     def __init__(self):
         return pages
     def process_youtube(self, video_url):
         try:
+            # Log the incoming URL
+            logger.info(f"Processing YouTube URL: {video_url}")
+            video_id = self._extract_video_id(video_url)
+            if not video_id:
+                logger.error(f"Invalid YouTube URL: {video_url}")
+                raise ValueError("This appears to be an invalid YouTube URL. Please check the URL and try again.")
+            # Log the extracted video ID
+            logger.info(f"Extracted video ID: {video_id}")
+            # List available transcripts
+            try:
+                transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+                logger.info(f"Available transcripts: {transcript_list}")
+            except Exception as e:
+                logger.error(f"Error listing transcripts: {str(e)}")
+            # Try to get the transcript
+            try:
+                transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
+                full_transcript = " ".join([entry['text'] for entry in transcript_list])
+                # Create a document-like structure
+                from langchain.schema import Document
+                doc = Document(
+                    page_content=full_transcript,
+                    metadata={"source": video_url}
+                )
+                return self.text_splitter.split_documents([doc])
+            except Exception as e:
+                logger.error(f"Error getting transcript: {str(e)}")
+                raise Exception(f"Unable to access video transcript. Error: {str(e)}\nPlease try a video with available captions.")
         except Exception as e:
+            logger.error(f"Process failed: {str(e)}")
+            raise
     def _extract_video_id(self, url):
         # Handle different YouTube URL formats