start
Browse files- Dockerfile +33 -0
- README.md +1 -7
- app.py +255 -0
- main.py +229 -0
- requirements.txt +12 -0
Dockerfile
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ---- Previous single-stage variant, kept commented for reference ----
# Use an official Python runtime as a parent image
#FROM python:3.9-slim

# Set the working directory in the container
#WORKDIR /app

# Copy the requirements file into the container
#COPY requirements.txt requirements.txt

# Install any needed packages specified in requirements.txt
#RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application's code into the container
#COPY . .

# Command to run the app
# The app must listen on port 7860 in Hugging Face Spaces
#CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]


FROM python:3.9

# Run as a dedicated non-root user (uid 1000, the convention on HF Spaces).
RUN useradd -m -u 1000 user
USER user
# User-level pip installs land in ~/.local/bin; make them resolvable.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy and install requirements first so Docker layer caching survives
# code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# Hugging Face Spaces routes traffic to port 7860.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
@@ -1,11 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji: 🏆
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: yellow
|
6 |
-
sdk: docker
|
7 |
-
pinned: false
|
8 |
-
short_description: ArtVendorScrapperPythonDocker
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: ArtVendorScrapperPython
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
---
|
4 |
|
5 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
"""
NOTE 1: Start Command starting a FastAPI on render:
@see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
uvicorn app:app --host 0.0.0.0 --port 10000


"""

import os , sys
import datetime , requests , random , logging , time , timeit
import simplejson as json
from fastapi import FastAPI , Request, HTTPException
from fastapi.responses import PlainTextResponse , HTMLResponse , Response , JSONResponse
# from fastapi import Request
from starlette.requests import Request

from bs4 import BeautifulSoup
from furl import furl
# from apscheduler.schedulers.blocking import BlockingScheduler
# from apscheduler.schedulers.background import BackgroundScheduler
from pymongo import MongoClient
import fire
import socket
import requests
from functools import wraps

from apscheduler.schedulers.background import BackgroundScheduler

HOSTNAME = socket.gethostname()

# Browser User-Agent strings; one is picked at random per outbound request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0"
]

# Search-engine crawler User-Agents (currently unused by the endpoints below).
BOT_AGENTS = [
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
]

# MONGODB-ATLAS SETUP
MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI') or None
AK = os.environ.get('AK') or None

##############################################################################
#
# LOGGING
#
##############################################################################

logging.basicConfig(level=logging.INFO , format='%(message)s')
logging.getLogger("requests").setLevel(logging.ERROR)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate=False

console_logger = logging.StreamHandler()
console_logger.setLevel(logging.DEBUG)
console_logger.setFormatter(logging.Formatter('%(message)s'))

logger.addHandler(console_logger)

if not MONGOATLAS_URI:
    logger.warning('Could not read the database URI')

# BUG FIX: this check previously re-tested MONGOATLAS_URI, so a missing
# access key (AK) was never reported at startup.
if not AK:
    logger.warning('Could not read the access key')


# Disable urllib3 warnings (sent by requests)
# requests.packages.urllib3.disable_warnings()

# Docs/OpenAPI endpoints are deliberately disabled: this is an internal
# scraper service, not a public API.
app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)
#app.config.from_pyfile('flaskapp.cfg')

port = 5000
scheduler = None
proxies = {}
# local_ip = socket.gethostbyname(hostname)
|
90 |
+
|
91 |
+
# Custom decorator to check for the access key
def require_access_key(func):
    """Guard an endpoint behind the ``AK`` query-parameter access key.

    The decorated endpoint must declare a ``request`` parameter so the
    wrapper can read ``request.query_params``. Returns a plain-text 401
    when the key is missing or wrong.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        request = kwargs.get('request')  # 'request' object from the endpoint's kwargs

        # Fail closed: if the endpoint forgot to declare 'request' (previously
        # an AttributeError -> 500), or no AK is configured server-side
        # (previously None == None let every key-less call through), reject.
        if request is None or AK is None:
            return PlainTextResponse("ERROR: Unauthorized call" , status_code=401)

        access_key = request.query_params.get('AK')

        # Check if the provided ACCESS_KEY matches the secret
        if access_key != AK:
            return PlainTextResponse("ERROR: Unauthorized call" , status_code=401)

        return func(*args, **kwargs)  # Call the actual endpoint function
    return wrapper
|
104 |
+
|
105 |
+
@app.get('/')
def index():
    """Root endpoint: log which host we are running on, answer 'OK'."""
    #return render_template('index.html')
    logger.info('hostname: {}'.format(HOSTNAME))
    return PlainTextResponse(content='OK', status_code=200)
|
110 |
+
|
111 |
+
@app.get('/ping')
def ping():
    """Liveness probe: always answers 'PONG'.

    Renamed from ``index`` — the old name shadowed the '/' handler's
    function at module level (routing itself was unaffected because
    FastAPI registers the handler at decoration time).
    """
    return PlainTextResponse('PONG' , 200)
|
114 |
+
|
115 |
+
@app.get("/remote_ip")
@require_access_key
def remote_ip(request:Request):
    """Return the caller's IP address as plain text.

    Protected by the AK query-parameter check in @require_access_key
    (which reads the same ``request`` kwarg).
    """
    # NOTE(review): behind a reverse proxy this is the proxy's address, not
    # the end client's; X-Forwarded-For handling would be needed — confirm.
    client_host = request.client.host
    return PlainTextResponse(client_host , 200)
|
120 |
+
|
121 |
+
@app.get("/task/faa_scrap_sold_listings_featured")
@require_access_key
def faa_scrap_sold_listings_featured_local(request:Request):
    """Scrape FAA's 'recent print sales' page and persist the parsed
    listings into one of several MongoDB databases chosen at random.

    Always answers HTTP 200: the plain-text body is either a str()-ed
    summary dict (site, status, timings, result count, db name) or the
    text of the exception that occurred.
    """

    global proxies

    # Timing buckets reported back in the response summary.
    timeit_request = 0
    timeit_parsing = 0
    timeit_mongo = 0

    response_body = '?'

    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined" , status_code=500)

    # Access-key verification is handled by @require_access_key now.
    #access_key = request.query_params['AK']
    #if access_key != AK:
    #    return PlainTextResponse("ERROR: Unauthorized call" , status_code=401)


    # Number of shard databases writes are spread over (faa_scrap_1..faa_scrap_4).
    cnt_dbs = 4

    headers = {
        'User-Agent': random.choice(USER_AGENTS)
    }

    # The random query parameter acts as a cache-buster.
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(random.randint(1000,1000000))
    r=None

    try:
        start = time.time()
        # NOTE(review): verify=False disables TLS certificate verification —
        # presumably required by the proxy setup; confirm it is intentional.
        r = requests.get(site_url , proxies=proxies , timeout=30 , verify=False , headers=headers)
        timeit_request = time.time()-start
    except Exception as e:
        response_body = str(e)

    if r and r.status_code==200:

        try:

            start = time.time()
            listings = parse_faa_sold_listings_page(r.text)
            timeit_parsing = time.time() - start

            # Document to persist: scrape timestamp, parsed results, and a
            # 'processed' flag for downstream consumers.
            d = dict()
            d['date_utc'] = datetime.datetime.utcnow()
            d['results'] = listings
            d['processed']= False

            status = "ok"

            # Pick one shard database at random.
            db_name = 'faa_scrap_' + str(random.randint(1,cnt_dbs))
            col_name = 'faa_sl'

            mongo_client = None
            try:
                start = time.time()
                mongo_client = MongoClient(MONGOATLAS_URI)
                db = mongo_client[db_name]
                col = db[col_name]
                # NOTE(review): this rebinds 'r' (the HTTP response) to the
                # insert result; harmless here since 'r' is not used again.
                r = col.insert_one(d)
                timeit_mongo = time.time() - start
            except Exception as e:
                # DB failure is reported in the summary, not raised.
                status = "error saving to mongodb ({})".format(str(e))
                logging.error(status)
            finally:
                # Best-effort close; mongo_client may still be None.
                try:
                    mongo_client.close()
                except Exception:
                    pass


            # Human-readable run summary returned to the caller.
            o = dict()
            o['site']="faa"
            o['status']=status
            o['date'] = d['date_utc']
            o['results_count'] = len(listings)
            o['db_name'] = db_name
            o['timeit'] = {'request':timeit_request,
                           'parsing':timeit_parsing,
                           'db':timeit_mongo}
            # o['proxy'] = json.dumps(proxies)

            response_body = str(o)

        except Exception as e:
            response_body = str(e)

    return PlainTextResponse(response_body, 200)
|
210 |
+
|
211 |
+
|
212 |
+
def parse_faa_sold_listings_page(html):
    """Extract sold-listing records from a FAA 'recent print sales' page.

    Returns a list of dicts with keys: item_page, image, artist_page,
    artist, sell_info.
    """
    soup = BeautifulSoup(html , 'lxml') # "html.parser"

    def _absolute(url):
        # Relative hrefs get the site domain prefixed, then the path is
        # normalized via furl.
        if not url.startswith('http'):
            url = 'https://fineartamerica.com/' + url
        normalized = furl(url)
        normalized.path.normalize()
        return normalized.url

    results = []
    for card in soup.find_all('div' , {'class':'productImageDiv'}):

        #if card['style'].find('hidden') > -1:
        #    continue

        artist_p = card.find('p', {'class': 'artistName'})

        entry = {
            'item_page': _absolute(card.find('a')['href']),
            'image': card.find('img', {'class': 'productImage'})['src'],
            'artist_page': _absolute(artist_p.a['href']),
            'artist': artist_p.text,
            'sell_info': card.find('p', {'class': 'orderLocation'}).text,
        }
        results.append(entry)

    del soup

    return results
|
252 |
+
|
253 |
+
#if __name__ == "__main__":
|
254 |
+
# import uvicorn
|
255 |
+
# uvicorn.run(app, host="0.0.0.0", port=7860)
|
main.py
ADDED
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
"""
NOTE 1: Start Command starting a FastAPI on render:
@see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
uvicorn app:app --host 0.0.0.0 --port 10000


"""

import os , sys
import datetime , requests , random , logging , time , timeit
import simplejson as json
from fastapi import FastAPI
from fastapi.responses import PlainTextResponse , HTMLResponse , Response , JSONResponse
# from fastapi import Request
from starlette.requests import Request

from bs4 import BeautifulSoup
from furl import furl
# from apscheduler.schedulers.blocking import BlockingScheduler
# from apscheduler.schedulers.background import BackgroundScheduler
from pymongo import MongoClient
import fire
import socket
import requests

from apscheduler.schedulers.background import BackgroundScheduler

HOSTNAME = socket.gethostname()

# Browser User-Agent strings; one is picked at random per outbound request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0"
]

# Search-engine crawler User-Agents (currently unused by the endpoints below).
BOT_AGENTS = [
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
]

# MONGODB-ATLAS SETUP
MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI')

##############################################################################
#
# LOGGING
#
##############################################################################

logging.basicConfig(level=logging.INFO , format='%(message)s')
logging.getLogger("requests").setLevel(logging.ERROR)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate=False

console_logger = logging.StreamHandler()
console_logger.setLevel(logging.DEBUG)
console_logger.setFormatter(logging.Formatter('%(message)s'))

logger.addHandler(console_logger)

# Disable urllib3 warnings (sent by requests)
# requests.packages.urllib3.disable_warnings()

app = FastAPI()
#app.config.from_pyfile('flaskapp.cfg')

port = 5000
scheduler = None
proxies = {}
# local_ip = socket.gethostbyname(hostname)
# Dev-machine-only proxy configuration, keyed on the local hostname.
if HOSTNAME == 'OCTOCORE':
    #proxies = {'http': 'http://192.168.1.68:80', 'https': 'http://192.168.1.68:80'}
    # SECURITY NOTE(review): proxy credentials are hard-coded here — they
    # should be moved to environment variables.
    proxies = {'http': 'https://anonyland:[email protected]:8080', 'https': 'http://anonyland:[email protected]:8080'}
    proxy_ip = '192.168.1.43:80'
|
85 |
+
|
86 |
+
@app.get('/')
def index():
    """Root endpoint: log which host we are running on, answer 'OK'."""
    #return render_template('index.html')
    logger.info('hostname: {}'.format(HOSTNAME))
    return PlainTextResponse(content='OK', status_code=200)
|
91 |
+
|
92 |
+
@app.get('/ping')
def ping():
    """Liveness probe: empty 200 response.

    Renamed from ``index`` — the old name shadowed the '/' handler's
    function at module level (routing itself was unaffected because
    FastAPI registers the handler at decoration time).
    """
    return Response(status_code=200)
|
95 |
+
|
96 |
+
@app.get("/remote_ip")
def remote_ip(request:Request):
    """Reply with the requesting client's IP address in plain text."""
    return PlainTextResponse(request.client.host, 200)
|
100 |
+
|
101 |
+
@app.get("/task/faa_scrap_sold_listings_featured")
def faa_scrap_sold_listings_featured_local():
    """Scrape FAA's 'recent print sales' page and persist the parsed
    listings into one of several MongoDB databases chosen at random.

    Always answers HTTP 200: the plain-text body is either a str()-ed
    summary dict (site, status, timings, result count, db name) or the
    text of the exception that occurred. Unlike app.py's variant, this
    endpoint has no access-key protection.
    """

    global proxies

    # Timing buckets reported back in the response summary.
    timeit_request = 0
    timeit_parsing = 0
    timeit_mongo = 0

    response_body = '?'

    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined" , status_code=500)

    # Number of shard databases writes are spread over (faa_scrap_1..faa_scrap_4).
    cnt_dbs = 4

    headers = {
        'User-Agent': random.choice(USER_AGENTS)
    }

    # The random query parameter acts as a cache-buster.
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(random.randint(1000,1000000))
    r=None

    try:
        start = time.time()
        # NOTE(review): verify=False disables TLS certificate verification —
        # presumably required by the proxy setup; confirm it is intentional.
        r = requests.get(site_url , proxies=proxies , timeout=30 , verify=False , headers=headers)
        timeit_request = time.time()-start
    except Exception as e:
        response_body = str(e)

    if r and r.status_code==200:

        try:

            start = time.time()
            listings = parse_faa_sold_listings_page(r.text)
            timeit_parsing = time.time() - start

            # Document to persist: scrape timestamp, parsed results, and a
            # 'processed' flag for downstream consumers.
            d = dict()
            d['date_utc'] = datetime.datetime.utcnow()
            d['results'] = listings
            d['processed']= False

            status = "ok"

            # Pick one shard database at random.
            db_name = 'faa_scrap_' + str(random.randint(1,cnt_dbs))
            col_name = 'faa_sl'

            mongo_client = None
            try:
                start = time.time()
                mongo_client = MongoClient(MONGOATLAS_URI)
                db = mongo_client[db_name]
                col = db[col_name]
                # NOTE(review): this rebinds 'r' (the HTTP response) to the
                # insert result; harmless here since 'r' is not used again.
                r = col.insert_one(d)
                timeit_mongo = time.time() - start
            except Exception as e:
                # DB failure is reported in the summary, not raised.
                status = "error saving to mongodb ({})".format(str(e))
                logging.error(status)
            finally:
                # Best-effort close; mongo_client may still be None.
                try:
                    mongo_client.close()
                except Exception:
                    pass


            # Human-readable run summary returned to the caller.
            o = dict()
            o['site']="faa"
            o['status']=status
            o['date'] = d['date_utc']
            o['results_count'] = len(listings)
            o['db_name'] = db_name
            o['timeit'] = {'request':timeit_request,
                           'parsing':timeit_parsing,
                           'db':timeit_mongo}
            # o['proxy'] = json.dumps(proxies)

            response_body = str(o)

        except Exception as e:
            response_body = str(e)

    return PlainTextResponse(response_body, 200)
|
184 |
+
|
185 |
+
|
186 |
+
def parse_faa_sold_listings_page(html):
    """Extract sold-listing records from a FAA 'recent print sales' page.

    Returns a list of dicts with keys: item_page, image, artist_page,
    artist, sell_info.
    """
    soup = BeautifulSoup(html , 'lxml') # "html.parser"

    def _absolute(url):
        # Relative hrefs get the site domain prefixed, then the path is
        # normalized via furl.
        if not url.startswith('http'):
            url = 'https://fineartamerica.com/' + url
        normalized = furl(url)
        normalized.path.normalize()
        return normalized.url

    results = []
    for card in soup.find_all('div' , {'class':'productImageDiv'}):

        #if card['style'].find('hidden') > -1:
        #    continue

        artist_p = card.find('p', {'class': 'artistName'})

        entry = {
            'item_page': _absolute(card.find('a')['href']),
            'image': card.find('img', {'class': 'productImage'})['src'],
            'artist_page': _absolute(artist_p.a['href']),
            'artist': artist_p.text,
            'sell_info': card.find('p', {'class': 'orderLocation'}).text,
        }
        results.append(entry)

    del soup

    return results
|
226 |
+
|
227 |
+
if __name__ == "__main__":
    # Local/dev entry point; in the Docker image the server is started by
    # the CMD (uvicorn) on the same port 7860 instead.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
uvicorn[standard]
|
2 |
+
fastapi
|
3 |
+
requests
|
4 |
+
pymongo
|
5 |
+
simplejson
|
6 |
+
pytz
|
7 |
+
beautifulsoup4
|
8 |
+
furl
|
9 |
+
lxml
|
10 |
+
apscheduler
|
11 |
+
fire
|
12 |
+
fake-useragent
|