Spaces:
Running
Running
Template Image v0
#5
by
ibombonato
- opened
- .gitattributes +2 -0
- README.md +4 -2
- app.py +103 -9
- assets/template_1.png +3 -0
- assets/template_2.png +3 -0
- fragrantica_crew.py +4 -3
- image_generator_tool.py +9 -13
- stealth_scrape_tool.py +75 -14
.gitattributes
CHANGED
@@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
36 |
assets/Montserrat-Bold.ttf filter=lfs diff=lfs merge=lfs -text
|
37 |
assets/Montserrat-Regular.ttf filter=lfs diff=lfs merge=lfs -text
|
38 |
assets/template_natura_empty.jpg filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
36 |
assets/Montserrat-Bold.ttf filter=lfs diff=lfs merge=lfs -text
|
37 |
assets/Montserrat-Regular.ttf filter=lfs diff=lfs merge=lfs -text
|
38 |
assets/template_natura_empty.jpg filter=lfs diff=lfs merge=lfs -text
|
39 |
+
assets/template_1.png filter=lfs diff=lfs merge=lfs -text
|
40 |
+
assets/template_2.png filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -19,7 +19,7 @@ The system uses a Gradio interface (`app.py`) with three main tabs:
|
|
19 |
* **Expert Perfume Analyst and Web Data Extractor:** This agent extracts detailed perfume information (notes, accords, longevity, sillage, similar fragrances, reviews) from the Fragrantica page.
|
20 |
* **Fragrance Expert Woman and Perfume Analysis Reporter:** This agent synthesizes the extracted data into a human-friendly report, including graded evaluations and personalized recommendations.
|
21 |
|
22 |
-
3. **Image Ad Generator:** This tab allows users to generate a promotional image for a product. It takes the product name, original price, final price, a coupon code, and a product image URL as input. The tool then generates a promotional image with this information, based on a template.
|
23 |
|
24 |
## Merchant Support
|
25 |
|
@@ -62,5 +62,7 @@ The application now supports generating ad copy for both **Natura** and **Mercad
|
|
62 |
- [x] Add support for any model/api key supported by LiteLLM.
|
63 |
- [x] Add Fragrantica support, where user will input a Fragrantica URL and the agent will extract and generate a Perfume Analysis report.
|
64 |
- [x] Support Mercado Livre Merchant
|
65 |
-
- [
|
|
|
|
|
66 |
- [ ] Create carousel images for Fragrantica post
|
|
|
19 |
* **Expert Perfume Analyst and Web Data Extractor:** This agent extracts detailed perfume information (notes, accords, longevity, sillage, similar fragrances, reviews) from the Fragrantica page.
|
20 |
* **Fragrance Expert Woman and Perfume Analysis Reporter:** This agent synthesizes the extracted data into a human-friendly report, including graded evaluations and personalized recommendations.
|
21 |
|
22 |
+
3. **Image Ad Generator:** This tab allows users to generate a promotional image for a product. It takes the product name, original price, final price, a coupon code, and a product image URL as input. The tool then generates a promotional image with this information, based on a template. The generated image is saved as a temporary file and displayed in the interface.
|
23 |
|
24 |
## Merchant Support
|
25 |
|
|
|
62 |
- [x] Add support for any model/api key supported by LiteLLM.
|
63 |
- [x] Add Fragrantica support, where user will input a Fragrantica URL and the agent will extract and generate a Perfume Analysis report.
|
64 |
- [x] Support Mercado Livre Merchant
|
65 |
+
- [x] Add image templates
|
66 |
+
- [wip] Add share button to images that works with Instagram on mobile (Android/iPhone).
|
67 |
+
- [ ] Make image template generation work seamlessly with data from the generated post.
|
68 |
- [ ] Create carousel images for Fragrantica post
|
app.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import requests
|
|
|
|
|
|
|
4 |
from crewai import Agent, Task, Crew, Process, LLM
|
5 |
from dotenv import load_dotenv
|
6 |
from stealth_scrape_tool import StealthScrapeTool
|
@@ -140,6 +143,42 @@ def clean_env_vars():
|
|
140 |
os.environ.pop("OPENAI_BASE_URL", None)
|
141 |
os.environ.pop("OPENAI_MODEL_NAME", None)
|
142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
# --- Gradio Interface ---
|
144 |
def generate_ad(product_url: str, store_name: str, main_cupom: str, main_cupom_discount_percentage: float, cupom_1: str, original_price: float, discounted_price: float, openai_api_key: str, natura_api_token: str, openai_base_url: str, openai_model_name: str):
|
145 |
yield gr.update(interactive=False, value="Generating..."), gr.Markdown(value="⏳ Generating ad... Please wait.")
|
@@ -175,7 +214,7 @@ with gr.Blocks() as demo:
|
|
175 |
original_price_input = gr.Number(label="Original Price (Optional)", value=0, minimum=0)
|
176 |
discounted_price_input = gr.Number(label="Discounted Price (Optional)", value=0, minimum=0)
|
177 |
with gr.Row():
|
178 |
-
|
179 |
clear_button = gr.Button("Clear")
|
180 |
ad_output = gr.Markdown(label="Your Generated Ad", show_copy_button=True)
|
181 |
|
@@ -185,6 +224,25 @@ with gr.Blocks() as demo:
|
|
185 |
analyze_fragrantica_button = gr.Button("Analyze Fragrantica Product")
|
186 |
fragrantica_output = gr.Markdown(label="Fragrantica Analysis Report")
|
187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
with gr.Tab("Settings"):
|
189 |
gr.Markdown("### ⚙️ API Key Settings")
|
190 |
gr.Markdown("Enter your API keys below. These will be used for the current session.")
|
@@ -200,21 +258,57 @@ with gr.Blocks() as demo:
|
|
200 |
def clear_fields():
|
201 |
return "", 0, 0
|
202 |
|
203 |
-
generate_button.click(generate_ad, inputs=[url_input, store_name_input, main_cupom_input, main_cupom_discount_percentage_input, cupom_1_input, original_price_input, discounted_price_input, openai_key_input, natura_token_input, openai_base_url_input, openai_model_name_input], outputs=[generate_button, ad_output])
|
204 |
-
clear_button.click(clear_fields, inputs=[], outputs=[url_input, original_price_input, discounted_price_input])
|
205 |
-
|
206 |
-
# Placeholder for Fragrantica analysis function
|
207 |
def analyze_fragrantica_url(url, openai_api_key, natura_api_token, openai_base_url, openai_model_name):
|
|
|
208 |
if not openai_api_key or not openai_model_name or not openai_base_url:
|
209 |
-
|
|
|
210 |
from fragrantica_crew import FragranticaCrew
|
211 |
fragrantica_crew = FragranticaCrew(openai_api_key, openai_base_url, openai_model_name)
|
212 |
report = fragrantica_crew.kickoff(url=url)
|
213 |
if report == "SCRAPING_FAILED":
|
214 |
-
|
215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
|
|
|
|
|
217 |
analyze_fragrantica_button.click(analyze_fragrantica_url, inputs=[fragrantica_url_input, openai_key_input, natura_token_input, openai_base_url_input, openai_model_name_input], outputs=fragrantica_output)
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
if __name__ == "__main__":
|
220 |
-
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import requests
|
4 |
+
import base64
|
5 |
+
from io import BytesIO
|
6 |
+
from PIL import Image
|
7 |
from crewai import Agent, Task, Crew, Process, LLM
|
8 |
from dotenv import load_dotenv
|
9 |
from stealth_scrape_tool import StealthScrapeTool
|
|
|
143 |
os.environ.pop("OPENAI_BASE_URL", None)
|
144 |
os.environ.pop("OPENAI_MODEL_NAME", None)
|
145 |
|
146 |
+
js_share_logic = """
|
147 |
+
async (image_data_url) => {
|
148 |
+
if (!image_data_url) {
|
149 |
+
console.error("Share button clicked, but no image data URL found.");
|
150 |
+
alert("No image to share. Please generate an image first.");
|
151 |
+
return;
|
152 |
+
}
|
153 |
+
|
154 |
+
const dataUrl = image_data_url;
|
155 |
+
|
156 |
+
try {
|
157 |
+
const response = await fetch(dataUrl);
|
158 |
+
const blob = await response.blob();
|
159 |
+
const file = new File([blob], "shared_image.png", { type: "image/png" });
|
160 |
+
|
161 |
+
if (navigator.share && navigator.canShare({ files: [file] })) {
|
162 |
+
await navigator.share({
|
163 |
+
files: [file],
|
164 |
+
title: "Check out this image!",
|
165 |
+
text: "Created with a cool Gradio app.",
|
166 |
+
});
|
167 |
+
} else {
|
168 |
+
alert("Sharing not supported on this browser.");
|
169 |
+
}
|
170 |
+
} catch (error) {
|
171 |
+
if (error.name === 'AbortError') {
|
172 |
+
console.log("Share dialog cancelled by user.");
|
173 |
+
// Do nothing, as this is a user-initiated cancellation
|
174 |
+
} else {
|
175 |
+
console.error("Error sharing:", error);
|
176 |
+
alert("An error occurred while trying to share.");
|
177 |
+
}
|
178 |
+
}
|
179 |
+
}
|
180 |
+
"""
|
181 |
+
|
182 |
# --- Gradio Interface ---
|
183 |
def generate_ad(product_url: str, store_name: str, main_cupom: str, main_cupom_discount_percentage: float, cupom_1: str, original_price: float, discounted_price: float, openai_api_key: str, natura_api_token: str, openai_base_url: str, openai_model_name: str):
|
184 |
yield gr.update(interactive=False, value="Generating..."), gr.Markdown(value="⏳ Generating ad... Please wait.")
|
|
|
214 |
original_price_input = gr.Number(label="Original Price (Optional)", value=0, minimum=0)
|
215 |
discounted_price_input = gr.Number(label="Discounted Price (Optional)", value=0, minimum=0)
|
216 |
with gr.Row():
|
217 |
+
generate_ad_button = gr.Button("Generate Ad")
|
218 |
clear_button = gr.Button("Clear")
|
219 |
ad_output = gr.Markdown(label="Your Generated Ad", show_copy_button=True)
|
220 |
|
|
|
224 |
analyze_fragrantica_button = gr.Button("Analyze Fragrantica Product")
|
225 |
fragrantica_output = gr.Markdown(label="Fragrantica Analysis Report")
|
226 |
|
227 |
+
with gr.Tab("Images"):
|
228 |
+
gr.Markdown("### 🖼️ Generate Promotional Image")
|
229 |
+
with gr.Row():
|
230 |
+
with gr.Column():
|
231 |
+
image_product_url_input = gr.Textbox(label="Product Image URL", placeholder="Enter product image URL...")
|
232 |
+
image_product_name_input = gr.Textbox(label="Product Name", placeholder="Enter product name...")
|
233 |
+
image_original_price_input = gr.Number(label="Original Price", placeholder="Enter original price...")
|
234 |
+
image_final_price_input = gr.Number(label="Final Price", placeholder="Enter final price...")
|
235 |
+
image_coupon_code_input = gr.Textbox(label="Coupon Code", placeholder="Enter coupon code...")
|
236 |
+
gen_image_btn = gr.Button("Generate Image")
|
237 |
+
with gr.Column():
|
238 |
+
image_output = gr.Image(label="Generated Image", height=500, type="pil", interactive=False)
|
239 |
+
image_data_url_storage = gr.Textbox(
|
240 |
+
label="Image Data URL (for debugging)",
|
241 |
+
visible=True,
|
242 |
+
elem_id="image_data_url_storage"
|
243 |
+
)
|
244 |
+
share_button = gr.Button("🚀 Share Image", interactive=False)
|
245 |
+
|
246 |
with gr.Tab("Settings"):
|
247 |
gr.Markdown("### ⚙️ API Key Settings")
|
248 |
gr.Markdown("Enter your API keys below. These will be used for the current session.")
|
|
|
258 |
def clear_fields():
|
259 |
return "", 0, 0
|
260 |
|
|
|
|
|
|
|
|
|
261 |
def analyze_fragrantica_url(url, openai_api_key, natura_api_token, openai_base_url, openai_model_name):
|
262 |
+
yield "⏳ Analyzing Fragrantica product... Please wait." # Loading message
|
263 |
if not openai_api_key or not openai_model_name or not openai_base_url:
|
264 |
+
yield "Please configure your API keys in the settings section below."
|
265 |
+
return
|
266 |
from fragrantica_crew import FragranticaCrew
|
267 |
fragrantica_crew = FragranticaCrew(openai_api_key, openai_base_url, openai_model_name)
|
268 |
report = fragrantica_crew.kickoff(url=url)
|
269 |
if report == "SCRAPING_FAILED":
|
270 |
+
yield "❌ Scraping failed. The website could not be accessed or parsed. Please check the URL or try again later."
|
271 |
+
return
|
272 |
+
yield report.raw
|
273 |
+
|
274 |
+
def image_to_base64(pil_image):
|
275 |
+
print(f"image_to_base64 called. pil_image type: {type(pil_image)}")
|
276 |
+
if pil_image is None:
|
277 |
+
print("pil_image is None. Returning empty string and interactive=False.")
|
278 |
+
return "", gr.update(interactive=False)
|
279 |
+
buffered = BytesIO()
|
280 |
+
pil_image.save(buffered, format="PNG")
|
281 |
+
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
282 |
+
data_url = f"data:image/png;base64,{img_str}"
|
283 |
+
print("Returning data_url and interactive=True.")
|
284 |
+
return data_url, gr.update(interactive=True)
|
285 |
+
|
286 |
+
def generate_image(product_image_url, product_name, original_price, final_price, coupon_code):
|
287 |
+
tool = GenerateImageTool()
|
288 |
+
original_price_str = f"{original_price:.2f}".replace('.', ',')
|
289 |
+
final_price_str = f"{final_price:.2f}".replace('.', ',')
|
290 |
+
|
291 |
+
yield gr.update(interactive=False, value="Generating..."), None
|
292 |
+
|
293 |
+
image_path = tool._run(
|
294 |
+
product_image_url=product_image_url,
|
295 |
+
product_name=product_name,
|
296 |
+
original_price=original_price_str,
|
297 |
+
final_price=final_price_str,
|
298 |
+
coupon_code=coupon_code
|
299 |
+
)
|
300 |
+
|
301 |
+
pil_image = Image.open(image_path)
|
302 |
+
yield gr.update(interactive=True, value="Generate Image"), pil_image
|
303 |
|
304 |
+
generate_ad_button.click(generate_ad, inputs=[url_input, store_name_input, main_cupom_input, main_cupom_discount_percentage_input, cupom_1_input, original_price_input, discounted_price_input, openai_key_input, natura_token_input, openai_base_url_input, openai_model_name_input], outputs=[generate_ad_button, ad_output])
|
305 |
+
clear_button.click(clear_fields, inputs=[], outputs=[url_input, original_price_input, discounted_price_input])
|
306 |
analyze_fragrantica_button.click(analyze_fragrantica_url, inputs=[fragrantica_url_input, openai_key_input, natura_token_input, openai_base_url_input, openai_model_name_input], outputs=fragrantica_output)
|
307 |
+
gen_image_btn.click(generate_image,
|
308 |
+
inputs=[image_product_url_input, image_product_name_input, image_original_price_input, image_final_price_input, image_coupon_code_input],
|
309 |
+
outputs=[gen_image_btn, image_output])
|
310 |
+
image_output.change(image_to_base64, inputs=image_output, outputs=[image_data_url_storage, share_button])
|
311 |
+
share_button.click(fn=None, inputs=[image_data_url_storage], js=js_share_logic)
|
312 |
|
313 |
if __name__ == "__main__":
|
314 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
assets/template_1.png
ADDED
![]() |
Git LFS Details
|
assets/template_2.png
ADDED
![]() |
Git LFS Details
|
fragrantica_crew.py
CHANGED
@@ -12,7 +12,8 @@ class FragranticaCrew:
|
|
12 |
llm = LLM(
|
13 |
api_key=self.openai_api_key,
|
14 |
model=self.openai_model_name,
|
15 |
-
base_url=self.openai_base_url
|
|
|
16 |
)
|
17 |
|
18 |
self.research_agent = Agent(
|
@@ -39,7 +40,7 @@ class FragranticaCrew:
|
|
39 |
def kickoff(self, url: str) -> str:
|
40 |
research_task = Task(
|
41 |
description=(
|
42 |
-
f"""1. Scrape the content of the URL: {url} using the 'Stealth Web Scraper' tool with `website_url` as {url}
|
43 |
|
44 |
2. If scraping is successful, carefully analyze the entire page content to extract the following information:
|
45 |
|
@@ -54,7 +55,7 @@ class FragranticaCrew:
|
|
54 |
|
55 |
- Este Perfume me Lembra do: Find the section titled "Este perfume me lembra do", and list the perfume names mentioned there.
|
56 |
|
57 |
-
- Opinião dos usuários:
|
58 |
|
59 |
3. Present the extracted information in a clear, structured format, ready for reporting. If any specific piece of information cannot be found, check again to make sure they are not found, after check again, if you truly do not find the info, state 'N/A' for that field. If the entire scraping process fails, return "SCRAPING_FAILED".
|
60 |
"""
|
|
|
12 |
llm = LLM(
|
13 |
api_key=self.openai_api_key,
|
14 |
model=self.openai_model_name,
|
15 |
+
base_url=self.openai_base_url,
|
16 |
+
max_tokens=32768
|
17 |
)
|
18 |
|
19 |
self.research_agent = Agent(
|
|
|
40 |
def kickoff(self, url: str) -> str:
|
41 |
research_task = Task(
|
42 |
description=(
|
43 |
+
f"""1. Scrape the content of the URL: {url} using the 'Stealth Web Scraper' tool with `website_url` as {url}, `css_element` as "#main-content", and `wait_for_selectors` as "['#all-reviews']". If the scraping tool fails or returns empty content ONCE, try the `css_element` as "body", If they also fail when you pass `css_element` as "body", then you MUST return the exact string "SCRAPING_FAILED".
|
44 |
|
45 |
2. If scraping is successful, carefully analyze the entire page content to extract the following information:
|
46 |
|
|
|
55 |
|
56 |
- Este Perfume me Lembra do: Find the section titled "Este perfume me lembra do", and list the perfume names mentioned there.
|
57 |
|
58 |
+
- Opinião dos usuários: Locate the HTML div with ID "all-reviews" within the scraped content. From this div, extract the *full text content* of all individual user reviews. Then, synthesize a detailed summary from these extracted texts, focusing on common themes, sentiments, and key observations from the users.
|
59 |
|
60 |
3. Present the extracted information in a clear, structured format, ready for reporting. If any specific piece of information cannot be found, check again to make sure they are not found, after check again, if you truly do not find the info, state 'N/A' for that field. If the entire scraping process fails, return "SCRAPING_FAILED".
|
61 |
"""
|
image_generator_tool.py
CHANGED
@@ -3,7 +3,7 @@ from pydantic import BaseModel, Field
|
|
3 |
from PIL import Image, ImageDraw, ImageFont
|
4 |
import requests
|
5 |
from io import BytesIO
|
6 |
-
import
|
7 |
|
8 |
class GenerateImageToolInput(BaseModel):
|
9 |
"""Input for the Generate Image Tool."""
|
@@ -14,18 +14,14 @@ class GenerateImageToolInput(BaseModel):
|
|
14 |
coupon_code: str = Field(..., description="Coupon code to be displayed on the image.")
|
15 |
|
16 |
import tempfile
|
17 |
-
import os
|
18 |
|
19 |
class GenerateImageTool(BaseTool):
|
20 |
name: str = "Generate Image Tool"
|
21 |
-
description: str = "Generates a promotional image for a product using a template."
|
22 |
args_schema = GenerateImageToolInput
|
23 |
|
24 |
def _run(self, product_image_url: str, product_name: str, original_price: str, final_price: str, coupon_code: str) -> str:
|
25 |
-
|
26 |
-
template_path = 'assets/template_natura_empty.jpg'
|
27 |
-
temp_dir = tempfile.gettempdir()
|
28 |
-
output_path = os.path.join(temp_dir, f'{uuid.uuid4()}.png')
|
29 |
|
30 |
try:
|
31 |
template_image = Image.open(template_path).convert("RGBA")
|
@@ -64,15 +60,15 @@ class GenerateImageTool(BaseTool):
|
|
64 |
black_color = "#000000"
|
65 |
|
66 |
draw.text((360, 710), product_name, font=font_name, fill=white_color, anchor="ms")
|
67 |
-
draw.text((360, 800), original_price, font=font_price_from, fill=white_color, anchor="ms")
|
68 |
-
draw.text((360, 860), final_price, font=font_price, fill=yellow_color, anchor="ms")
|
69 |
draw.text((360, 993), coupon_code, font=font_cupom, fill=black_color, anchor="ms")
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
|
75 |
except FileNotFoundError:
|
76 |
return f"Error: The template file '{template_path}' was not found."
|
77 |
except Exception as e:
|
78 |
-
return f"An error occurred: {e}"
|
|
|
3 |
from PIL import Image, ImageDraw, ImageFont
|
4 |
import requests
|
5 |
from io import BytesIO
|
6 |
+
import base64
|
7 |
|
8 |
class GenerateImageToolInput(BaseModel):
|
9 |
"""Input for the Generate Image Tool."""
|
|
|
14 |
coupon_code: str = Field(..., description="Coupon code to be displayed on the image.")
|
15 |
|
16 |
import tempfile
|
|
|
17 |
|
18 |
class GenerateImageTool(BaseTool):
|
19 |
name: str = "Generate Image Tool"
|
20 |
+
description: str = "Generates a promotional image for a product using a template and returns the file path."
|
21 |
args_schema = GenerateImageToolInput
|
22 |
|
23 |
def _run(self, product_image_url: str, product_name: str, original_price: str, final_price: str, coupon_code: str) -> str:
|
24 |
+
template_path = 'assets/template_1.png'
|
|
|
|
|
|
|
25 |
|
26 |
try:
|
27 |
template_image = Image.open(template_path).convert("RGBA")
|
|
|
60 |
black_color = "#000000"
|
61 |
|
62 |
draw.text((360, 710), product_name, font=font_name, fill=white_color, anchor="ms")
|
63 |
+
draw.text((360, 800), f"De: R$ {original_price}", font=font_price_from, fill=white_color, anchor="ms")
|
64 |
+
draw.text((360, 860), f"Por: R$ {final_price}", font=font_price, fill=yellow_color, anchor="ms")
|
65 |
draw.text((360, 993), coupon_code, font=font_cupom, fill=black_color, anchor="ms")
|
66 |
|
67 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
|
68 |
+
template_image.save(temp_file.name)
|
69 |
+
return temp_file.name
|
70 |
|
71 |
except FileNotFoundError:
|
72 |
return f"Error: The template file '{template_path}' was not found."
|
73 |
except Exception as e:
|
74 |
+
return f"An error occurred: {e[:1000]}"
|
stealth_scrape_tool.py
CHANGED
@@ -8,38 +8,99 @@ class StealthScrapeTool(BaseTool):
|
|
8 |
name: str = "Stealth Web Scraper"
|
9 |
description: str = "A tool for stealthily scraping content from a given URL using Playwright and a CSS selector."
|
10 |
|
11 |
-
async def _arun(self, website_url: str, css_element = "body") -> str:
|
12 |
try:
|
13 |
async with Stealth().use_async(async_playwright()) as p:
|
14 |
browser = await p.chromium.launch(headless=True)
|
15 |
page = await browser.new_page()
|
16 |
|
|
|
|
|
17 |
await page.goto(website_url, timeout=120000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
try:
|
20 |
-
# Wait for the specific element to be present
|
21 |
-
await page.wait_for_selector(css_element, timeout=30000)
|
22 |
-
except Exception as e:
|
23 |
-
# If timeout error, try again with "body" as css_element
|
24 |
-
if "Timeout" in str(e) and css_element != "body":
|
25 |
-
await page.wait_for_selector("body", timeout=60000)
|
26 |
-
css_element = "body"
|
27 |
-
else:
|
28 |
-
raise e
|
29 |
|
|
|
30 |
html_content = await page.content()
|
31 |
soup = BeautifulSoup(html_content, 'html.parser')
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
target_element = soup.select_one(css_element)
|
34 |
if target_element:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
return target_element.prettify()
|
36 |
else:
|
37 |
return f"Error: Could not find element with selector '{css_element}' on the page."
|
38 |
except Exception as e:
|
39 |
return f"Error during stealth web scraping: {e}"
|
40 |
|
41 |
-
def _run(self, website_url: str, css_element: str) -> str:
|
42 |
# This method is for synchronous execution, which is not ideal for Playwright.
|
43 |
# CrewAI typically calls _arun for async tools.
|
44 |
# For simplicity, we'll just call the async version here.
|
45 |
-
return asyncio.run(self._arun(website_url, css_element))
|
|
|
8 |
name: str = "Stealth Web Scraper"
|
9 |
description: str = "A tool for stealthily scraping content from a given URL using Playwright and a CSS selector."
|
10 |
|
11 |
+
async def _arun(self, website_url: str, css_element = "body", wait_for_selectors: list[str] = None) -> str:
|
12 |
try:
|
13 |
async with Stealth().use_async(async_playwright()) as p:
|
14 |
browser = await p.chromium.launch(headless=True)
|
15 |
page = await browser.new_page()
|
16 |
|
17 |
+
print(f"StealthScrapeTool: Starting scraping for {website_url}...")
|
18 |
+
print(f"StealthScrapeTool: Navigating to {website_url}")
|
19 |
await page.goto(website_url, timeout=120000)
|
20 |
+
await asyncio.sleep(5)
|
21 |
+
|
22 |
+
# Scroll to the bottom of the page repeatedly to load all dynamic content
|
23 |
+
print("StealthScrapeTool: Scrolling through the page to load dynamic content...")
|
24 |
+
print("StealthScrapeTool: Getting initial scrollHeight...")
|
25 |
+
last_height = await page.evaluate("document.body.scrollHeight")
|
26 |
+
print(f"StealthScrapeTool: Initial scrollHeight: {last_height}")
|
27 |
+
scroll_attempts = 0
|
28 |
+
max_scroll_attempts = 10
|
29 |
+
|
30 |
+
while scroll_attempts < max_scroll_attempts:
|
31 |
+
print(f"StealthScrapeTool: Scroll attempt {scroll_attempts + 1}")
|
32 |
+
print("StealthScrapeTool: Scrolling to bottom...")
|
33 |
+
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
34 |
+
print("StealthScrapeTool: Scrolled. Waiting for content to load...")
|
35 |
+
|
36 |
+
await asyncio.sleep(5)
|
37 |
+
|
38 |
+
print("StealthScrapeTool: Getting new scrollHeight...")
|
39 |
+
new_height = await page.evaluate("document.body.scrollHeight")
|
40 |
+
print(f"StealthScrapeTool: New scrollHeight: {new_height}")
|
41 |
+
if new_height == last_height:
|
42 |
+
print("StealthScrapeTool: ScrollHeight unchanged. Breaking scroll loop.")
|
43 |
+
break
|
44 |
+
last_height = new_height
|
45 |
+
scroll_attempts += 1
|
46 |
+
print("StealthScrapeTool: Finished scrolling.")
|
47 |
+
|
48 |
+
print(f"StealthScrapeTool: Page loaded. Attempting to find element with selector '{css_element}'")
|
49 |
+
|
50 |
+
# Element waiting logic
|
51 |
+
selectors_to_wait_for = []
|
52 |
+
if wait_for_selectors:
|
53 |
+
print("StealthScrapeTool: Additional selectors to wait for provided.")
|
54 |
+
selectors_to_wait_for.extend(wait_for_selectors)
|
55 |
+
|
56 |
+
# Always include css_element in the list of selectors to wait for
|
57 |
+
selectors_to_wait_for.append(css_element)
|
58 |
+
|
59 |
+
combined_selector = ", ".join(selectors_to_wait_for)
|
60 |
+
print(f"StealthScrapeTool: Waiting for selectors: {combined_selector}")
|
61 |
+
await page.wait_for_selector(combined_selector, timeout=60000, state='attached')
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
+
print("StealthScrapeTool: Required elements found. Extracting content...")
|
65 |
html_content = await page.content()
|
66 |
soup = BeautifulSoup(html_content, 'html.parser')
|
67 |
+
|
68 |
+
# Debug print to confirm if waited-for elements are in the scraped content
|
69 |
+
if soup.select_one("#all-reviews"):
|
70 |
+
print("StealthScrapeTool: #all-reviews found in scraped content.")
|
71 |
+
else:
|
72 |
+
print("StealthScrapeTool: #all-reviews NOT found in scraped content.")
|
73 |
+
|
74 |
target_element = soup.select_one(css_element)
|
75 |
if target_element:
|
76 |
+
# Clean the HTML content
|
77 |
+
print(f"Successfully found element with selector '{css_element}'. Cleaning content...")
|
78 |
+
for script in target_element.find_all("script"):
|
79 |
+
script.decompose()
|
80 |
+
for style_tag in target_element.find_all("style"):
|
81 |
+
style_tag.decompose()
|
82 |
+
for img in target_element.find_all("img"):
|
83 |
+
img.decompose()
|
84 |
+
for svg in target_element.find_all("svg"):
|
85 |
+
svg.decompose()
|
86 |
+
for iframe in target_element.find_all("iframe"):
|
87 |
+
iframe.decompose()
|
88 |
+
for source_tag in target_element.find_all("source"):
|
89 |
+
source_tag.decompose()
|
90 |
+
|
91 |
+
# Remove style attributes from all tags
|
92 |
+
for tag in target_element.find_all(True):
|
93 |
+
if 'style' in tag.attrs:
|
94 |
+
del tag['style']
|
95 |
+
|
96 |
return target_element.prettify()
|
97 |
else:
|
98 |
return f"Error: Could not find element with selector '{css_element}' on the page."
|
99 |
except Exception as e:
|
100 |
return f"Error during stealth web scraping: {e}"
|
101 |
|
102 |
+
def _run(self, website_url: str, css_element: str, wait_for_selectors: list[str] = None) -> str:
|
103 |
# This method is for synchronous execution, which is not ideal for Playwright.
|
104 |
# CrewAI typically calls _arun for async tools.
|
105 |
# For simplicity, we'll just call the async version here.
|
106 |
+
return asyncio.run(self._arun(website_url, css_element, wait_for_selectors))
|