WaysAheadGlobal committed on
Commit
24e5396
·
verified ·
1 Parent(s): 3da8dd4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -17
app.py CHANGED
@@ -1,13 +1,12 @@
1
- # app.py
2
-
3
  import streamlit as st
 
4
  from transformers import pipeline
5
  from PIL import Image
6
- import requests
7
-
8
- st.set_page_config(page_title="TinyLLaVA (Streamlit)", layout="centered")
9
- st.title("πŸ¦™ TinyLLaVA β€” Vision-Language Q&A")
10
 
 
11
  pipe = pipeline(
12
  task="image-to-text",
13
  model="bczhou/tiny-llava-v1-hf",
@@ -15,17 +14,53 @@ pipe = pipeline(
15
  device_map="cpu"
16
  )
17
 
18
- uploaded_file = st.file_uploader("πŸ“· Upload an image", type=["jpg","png","jpeg"])
19
- prompt = st.text_input("πŸ’¬ Ask a question (post `<image>` token):", value="What is happening?")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- if uploaded_file and prompt:
22
- image = Image.open(uploaded_file).convert("RGB")
23
- st.image(image, caption="Uploaded Image", use_column_width=True)
24
 
25
- query = f"USER: <image>\n{prompt}\nASSISTANT:"
26
- with st.spinner("Generating answer..."):
27
- result = pipe(query, image)
28
- answer = result[0]["generated_text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- st.subheader("πŸ“ Answer:")
31
- st.write(answer)
 
 
 
 
 
1
  import streamlit as st
2
+ from streamlit_webrtc import VideoTransformerBase, webrtc_streamer, RTCConfiguration
3
  from transformers import pipeline
4
  from PIL import Image
5
+ import cv2
6
+ import numpy as np
7
+ import time
 
8
 
9
+ # Load TinyLLaVA pipeline once
10
  pipe = pipeline(
11
  task="image-to-text",
12
  model="bczhou/tiny-llava-v1-hf",
 
14
  device_map="cpu"
15
  )
16
 
17
# Page chrome: browser tab title, centered layout, and the visible heading.
st.set_page_config(layout="centered", page_title="TinyLLaVA Webcam")
st.title("πŸ¦™ TinyLLaVA β€” Webcam Captioning")

# Shared state: two empty placeholders created in page order, before the
# video widget is rendered.
st_frame, result_box = st.empty(), st.empty()
23
+
24
class VideoProcessor(VideoTransformerBase):
    """Pass webcam frames through unchanged and caption one periodically.

    Attributes:
        last_run: ``time.time()`` of the most recent captioning attempt.
        interval: Minimum number of seconds between captioning attempts.
        last_caption: Text produced by the most recent captioning attempt,
            or an error message if that attempt failed. Empty until the
            first attempt finishes.
    """

    def __init__(self):
        # Function-scope import so this class does not depend on the
        # module's import block being edited.
        import threading

        self.last_run = 0
        self.interval = 5  # seconds
        self.last_caption = ""
        # Held while a captioning job is in flight so that slow inference
        # never causes jobs to pile up. A plain Lock (not RLock) because it
        # is released from the worker thread, not the acquiring thread.
        self._inference_lock = threading.Lock()

    def _caption_frame(self, bgr_frame):
        """Run TinyLLaVA on one BGR frame and store the text in last_caption.

        Runs on a background thread. Must be entered with
        ``self._inference_lock`` held; always releases it on exit.
        """
        try:
            # Convert BGR to RGB
            rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            prompt = "Describe this scene in detail."
            query = f"USER: <image>\n{prompt}\nASSISTANT:"
            # The image-to-text pipeline takes the image as its positional
            # input and the text via ``prompt=``; the previous
            # ``pipe(query, pil_image)`` put the prompt in the image slot.
            result = pipe(pil_image, prompt=query)
            self.last_caption = result[0]["generated_text"]
        except Exception as exc:
            # Thread boundary: nothing above us would report this, so make
            # the failure visible where the caption is displayed.
            self.last_caption = f"Captioning failed: {exc}"
        finally:
            self._inference_lock.release()

    def transform(self, frame):
        """Return the frame unmodified, scheduling a caption when one is due.

        A caption is due when more than ``interval`` seconds have passed
        since the last attempt and no previous attempt is still running.

        No Streamlit calls are made here: this callback is invoked by the
        WebRTC worker rather than Streamlit's script thread, so UI elements
        such as ``st.spinner`` cannot be used. Inference is handed to a
        background thread so the video stream does not stall while the
        model runs.
        """
        import threading

        img = frame.to_ndarray(format="bgr24")

        now = time.time()
        is_due = now - self.last_run > self.interval
        if is_due and self._inference_lock.acquire(blocking=False):
            self.last_run = now
            # Copy so the worker owns its pixels independently of the array
            # returned to the stream below.
            worker = threading.Thread(
                target=self._caption_frame,
                args=(img.copy(),),
                daemon=True,
            )
            worker.start()

        # Return the same frame, unmodified
        return img
50
+
51
# RTC config: a single public STUN server for ICE negotiation.
rtc_config = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)

# Video-only stream; each session gets its own VideoProcessor instance.
webrtc_ctx = webrtc_streamer(
    key="example",
    video_processor_factory=VideoProcessor,
    rtc_configuration=rtc_config,
    media_stream_constraints={"video": True, "audio": False},
)

if webrtc_ctx.video_processor:
    st.info("Keep your webcam on. The app captures 1 frame every 5 seconds and generates a caption.")
    st.write("Latest Caption:")
    caption_box = st.empty()
    # A Streamlit script runs top to bottom once per rerun, so reading
    # last_caption a single time would show whatever it held at that instant
    # (normally the empty initial value). Poll while the stream is live so
    # new captions appear as the processor produces them.
    while webrtc_ctx.state.playing:
        processor = webrtc_ctx.video_processor
        if processor is None:
            # The processor is gone (stream stopped); stop polling.
            break
        caption_box.write(processor.last_caption)
        time.sleep(0.5)