menorki committed on
Commit
776f3ff
·
1 Parent(s): 9ca1718
Files changed (5) hide show
  1. Dockerfile +33 -0
  2. README.md +1 -7
  3. app.py +255 -0
  4. main.py +229 -0
  5. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ #FROM python:3.9-slim
3
+
4
+ # Set the working directory in the container
5
+ #WORKDIR /app
6
+
7
+ # Copy the requirements file into the container
8
+ #COPY requirements.txt requirements.txt
9
+
10
+ # Install any needed packages specified in requirements.txt
11
+ #RUN pip install --no-cache-dir -r requirements.txt
12
+
13
+ # Copy the rest of the application's code into the container
14
+ #COPY . .
15
+
16
+ # Command to run the app
17
+ # The app must listen on port 7860 in Hugging Face Spaces
18
+ #CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
19
+
20
+
21
+ FROM python:3.9
22
+
23
+ RUN useradd -m -u 1000 user
24
+ USER user
25
+ ENV PATH="/home/user/.local/bin:$PATH"
26
+
27
+ WORKDIR /app
28
+
29
+ COPY --chown=user ./requirements.txt requirements.txt
30
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
31
+
32
+ COPY --chown=user . /app
33
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,5 @@
1
  ---
2
- title: ArtVendorScrapperPythonDocker
3
- emoji: 🏆
4
- colorFrom: gray
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- short_description: ArtVendorScrapperPythonDocker
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: ArtVendorScrapperPython
 
 
 
 
 
 
3
  ---
4
 
5
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ NOTE 1: Start command for running this FastAPI app on Render:
4
+ @see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
5
+ uvicorn app:app --host 0.0.0.0 --port 10000
6
+
7
+
8
+ """
9
+
10
+ import os , sys
11
+ import datetime , requests , random , logging , time , timeit
12
+ import simplejson as json
13
+ from fastapi import FastAPI , Request, HTTPException
14
+ from fastapi.responses import PlainTextResponse , HTMLResponse , Response , JSONResponse
15
+ # from fastapi import Request
16
+ from starlette.requests import Request
17
+
18
+ from bs4 import BeautifulSoup
19
+ from furl import furl
20
+ # from apscheduler.schedulers.blocking import BlockingScheduler
21
+ # from apscheduler.schedulers.background import BackgroundScheduler
22
+ from pymongo import MongoClient
23
+ import fire
24
+ import socket
25
+ import requests
26
+ from functools import wraps
27
+
28
+ from apscheduler.schedulers.background import BackgroundScheduler
29
+
30
+ HOSTNAME = socket.gethostname()
31
+
32
+ USER_AGENTS = [
33
+ "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
34
+ "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
35
+ "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
36
+ "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
37
+ "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0"
38
+ ]
39
+
40
+ BOT_AGENTS = [
41
+ "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
42
+ "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
43
+ "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
44
+ "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
45
+ "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
46
+ "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
47
+ "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
48
+ ]
49
+
50
+ # MONGODB-ATLAS SETUP
51
+ MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI') or None
52
+ AK = os.environ.get('AK') or None
53
+
54
+ ##############################################################################
55
+ #
56
+ # LOGGING
57
+ #
58
+ ##############################################################################
59
+
60
# Root logger prints bare messages at INFO; the `requests` logger is limited
# to errors.
logging.basicConfig(format='%(message)s', level=logging.INFO)
logging.getLogger("requests").setLevel(logging.ERROR)

# Console handler used by this module's own logger.
console_logger = logging.StreamHandler()
console_logger.setFormatter(logging.Formatter('%(message)s'))
console_logger.setLevel(logging.DEBUG)

# Module logger: DEBUG and above, written through the handler above only.
# Propagation is off so records do not also reach the root handler.
logger = logging.getLogger(__name__)
logger.propagate = False
logger.setLevel(logging.DEBUG)
logger.addHandler(console_logger)
72
+
73
# Warn at import time about missing configuration so a misconfigured
# deployment is visible in the logs before the first request arrives.
if not MONGOATLAS_URI:
    logger.warning('Could not read the database URI')

# This check previously tested MONGOATLAS_URI a second time, so a missing
# access key was never reported (and a missing URI was reported twice).
if not AK:
    logger.warning('Could not read the access key')
78
+
79
+
80
+ # Disable urllib3 warnings (sent by requests)
81
+ # requests.packages.urllib3.disable_warnings()
82
+
83
+ app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)
84
+ #app.config.from_pyfile('flaskapp.cfg')
85
+
86
+ port = 5000
87
+ scheduler = None
88
+ proxies = {}
89
+ # local_ip = socket.gethostbyname(hostname)
90
+
91
def require_access_key(func):
    """Decorate an endpoint so it only runs when the caller supplies the access key.

    The key is read from the ``AK`` query parameter of the endpoint's
    ``request`` keyword argument and compared with the module-level ``AK``
    secret.

    Args:
        func: Endpoint callable that receives the request object as the
            keyword argument ``request``.

    Returns:
        A wrapper carrying ``func``'s metadata (via ``functools.wraps``). It
        returns a 401 plain-text response when the request is missing, the
        secret is not configured, or the supplied key does not match;
        otherwise it returns whatever ``func`` returns.
    """
    import hmac  # local import: only this decorator needs it

    @wraps(func)
    def wrapper(*args, **kwargs):
        request = kwargs.get('request')
        access_key = None
        if request is not None:
            access_key = request.query_params.get('AK')

        # Fail closed. With a plain `access_key != AK` comparison, an unset
        # secret (None) combined with a missing query parameter (None)
        # compared equal and let the call through.
        authorized = (
            bool(AK)
            and access_key is not None
            # Constant-time comparison on bytes (str input must be ASCII-only).
            and hmac.compare_digest(access_key.encode('utf-8'), AK.encode('utf-8'))
        )
        if not authorized:
            return PlainTextResponse("ERROR: Unauthorized call", status_code=401)

        return func(*args, **kwargs)

    return wrapper
104
+
105
@app.get('/')
def index():
    """Root endpoint: log the machine's hostname and answer ``OK`` (HTTP 200)."""
    logger.info('hostname: %s', HOSTNAME)
    return PlainTextResponse('OK', status_code=200)
110
+
111
@app.get('/ping')
def index():
    """Liveness probe: always answers ``PONG`` with HTTP 200.

    NOTE(review): this function reuses the name ``index`` of the root
    handler. Both routes are registered by their decorators, but the module
    attribute ``index`` ends up bound to this function.
    """
    return PlainTextResponse(content='PONG', status_code=200)
114
+
115
@app.get("/remote_ip")
@require_access_key
def remote_ip(request: Request):
    """Return the caller's address (``request.client.host``) as plain text.

    Guarded by ``require_access_key``.
    """
    return PlainTextResponse(request.client.host, status_code=200)
120
+
121
@app.get("/task/faa_scrap_sold_listings_featured")
@require_access_key
def faa_scrap_sold_listings_featured_local(request: Request):
    """Scrape the Fine Art America recent-sales page and store one snapshot in MongoDB.

    Steps: download the page with a random user agent, parse it with
    ``parse_faa_sold_listings_page``, then insert a document
    ``{date_utc, results, processed}`` into collection ``faa_sl`` of a
    randomly chosen database ``faa_scrap_1`` .. ``faa_scrap_4``.

    Args:
        request: Incoming request; only consumed by ``require_access_key``.

    Returns:
        ``PlainTextResponse``. Status 500 when ``MONGOATLAS_URI`` is unset.
        Otherwise status 200 with either an error text (download or parsing
        failed, or the site answered with a non-200 status) or the ``str()``
        of a summary dict with keys ``site``, ``status``, ``date``,
        ``results_count``, ``db_name`` and ``timeit``. A MongoDB failure is
        reported in the summary's ``status`` field.
    """
    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined", status_code=500)

    cnt_dbs = 4  # snapshots are spread randomly over this many databases

    headers = {'User-Agent': random.choice(USER_AGENTS)}
    # Random query parameter -- presumably a cache buster; confirm.
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(
        random.randint(1000, 1000000))

    # --- download -----------------------------------------------------------
    try:
        start = time.time()
        # NOTE(review): verify=False disables TLS certificate verification.
        page = requests.get(site_url, proxies=proxies, timeout=30,
                            verify=False, headers=headers)
        timeit_request = time.time() - start
    except Exception as e:
        logger.error('request to %s failed: %s', site_url, e)
        return PlainTextResponse(str(e), 200)

    if page.status_code != 200:
        # Previously this case answered with the uninformative body '?'.
        message = "ERROR: unexpected HTTP status {}".format(page.status_code)
        logger.error(message)
        return PlainTextResponse(message, 200)

    # --- parse --------------------------------------------------------------
    try:
        start = time.time()
        listings = parse_faa_sold_listings_page(page.text)
        timeit_parsing = time.time() - start
    except Exception as e:
        logger.error('parsing the sold-listings page failed: %s', e)
        return PlainTextResponse(str(e), 200)

    # --- store --------------------------------------------------------------
    # Naive UTC timestamp, kept as-is so stored documents and the summary
    # text keep their current shape.
    document = {
        'date_utc': datetime.datetime.utcnow(),
        'results': listings,
        'processed': False,
    }

    status = "ok"
    timeit_mongo = 0
    db_name = 'faa_scrap_' + str(random.randint(1, cnt_dbs))
    col_name = 'faa_sl'

    mongo_client = None
    try:
        start = time.time()
        mongo_client = MongoClient(MONGOATLAS_URI)
        mongo_client[db_name][col_name].insert_one(document)
        timeit_mongo = time.time() - start
    except Exception as e:
        status = "error saving to mongodb ({})".format(str(e))
        # Use the module logger (own handler, propagate=False), not the root logger.
        logger.error(status)
    finally:
        if mongo_client is not None:
            try:
                mongo_client.close()
            except Exception as e:
                # Closing is best effort, but no longer fails silently.
                logger.warning('closing the mongodb client failed: %s', e)

    # --- report -------------------------------------------------------------
    summary = {
        'site': "faa",
        'status': status,
        'date': document['date_utc'],
        'results_count': len(listings),
        'db_name': db_name,
        'timeit': {'request': timeit_request,
                   'parsing': timeit_parsing,
                   'db': timeit_mongo},
    }
    return PlainTextResponse(str(summary), 200)
210
+
211
+
212
def parse_faa_sold_listings_page(html):
    """Extract the sold listings from the markup of the recent-sales page.

    Args:
        html: Page markup as a string.

    Returns:
        A list with one dict per ``div.productImageDiv`` element, in document
        order, holding the keys ``item_page``, ``image``, ``artist_page``,
        ``artist`` and ``sell_info``.

    Raises:
        TypeError / AttributeError: when a listing lacks one of the expected
        child elements (the lookups are not guarded).
    """
    site_root = 'https://fineartamerica.com/'

    def canonical(href):
        # Links not starting with 'http' are prefixed with the site root;
        # furl then normalises the path.
        target = furl(href if href.startswith('http') else site_root + href)
        target.path.normalize()
        return target.url

    document = BeautifulSoup(html, 'lxml')  # "html.parser"

    sold = []
    for card in document.find_all('div', {'class': 'productImageDiv'}):
        entry = {'item_page': canonical(card.find('a')['href'])}
        entry['image'] = card.find('img', {'class': 'productImage'})['src']

        artist_el = card.find('p', {'class': 'artistName'})
        entry['artist_page'] = canonical(artist_el.a['href'])
        entry['artist'] = artist_el.text

        entry['sell_info'] = card.find('p', {'class': 'orderLocation'}).text
        sold.append(entry)

    return sold
252
+
253
+ #if __name__ == "__main__":
254
+ # import uvicorn
255
+ # uvicorn.run(app, host="0.0.0.0", port=7860)
main.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ NOTE 1: Start command for running this FastAPI app on Render:
4
+ @see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
5
+ uvicorn app:app --host 0.0.0.0 --port 10000
6
+
7
+
8
+ """
9
+
10
+ import os , sys
11
+ import datetime , requests , random , logging , time , timeit
12
+ import simplejson as json
13
+ from fastapi import FastAPI
14
+ from fastapi.responses import PlainTextResponse , HTMLResponse , Response , JSONResponse
15
+ # from fastapi import Request
16
+ from starlette.requests import Request
17
+
18
+ from bs4 import BeautifulSoup
19
+ from furl import furl
20
+ # from apscheduler.schedulers.blocking import BlockingScheduler
21
+ # from apscheduler.schedulers.background import BackgroundScheduler
22
+ from pymongo import MongoClient
23
+ import fire
24
+ import socket
25
+ import requests
26
+
27
+ from apscheduler.schedulers.background import BackgroundScheduler
28
+
29
+ HOSTNAME = socket.gethostname()
30
+
31
+ USER_AGENTS = [
32
+ "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
33
+ "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
34
+ "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
35
+ "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
36
+ "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0"
37
+ ]
38
+
39
+ BOT_AGENTS = [
40
+ "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
41
+ "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
42
+ "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
43
+ "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
44
+ "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
45
+ "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
46
+ "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
47
+ ]
48
+
49
+ # MONGODB-ATLAS SETUP
50
+ MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI')
51
+
52
+ ##############################################################################
53
+ #
54
+ # LOGGING
55
+ #
56
+ ##############################################################################
57
+
58
# Root logger: message-only format at INFO; keep `requests` quiet below ERROR.
logging.basicConfig(format='%(message)s', level=logging.INFO)
logging.getLogger("requests").setLevel(logging.ERROR)

# Dedicated console handler for this module's logger.
console_logger = logging.StreamHandler()
console_logger.setFormatter(logging.Formatter('%(message)s'))
console_logger.setLevel(logging.DEBUG)

# Module logger at DEBUG; propagation is disabled so its records are not
# handed on to the root logger's handlers.
logger = logging.getLogger(__name__)
logger.propagate = False
logger.setLevel(logging.DEBUG)
logger.addHandler(console_logger)
70
+
71
+ # Disable urllib3 warnings (sent by requests)
72
+ # requests.packages.urllib3.disable_warnings()
73
+
74
+ app = FastAPI()
75
+ #app.config.from_pyfile('flaskapp.cfg')
76
+
77
+ port = 5000
78
+ scheduler = None
79
+ proxies = {}
80
+ # local_ip = socket.gethostbyname(hostname)
81
if HOSTNAME == 'OCTOCORE':
    # Development-host override: route scraping traffic through a proxy.
    # The proxy URL (including any credentials) is read from the environment
    # instead of being committed to the repository. When SCRAPER_PROXY_URL is
    # not set, `proxies` keeps its default and requests go out directly.
    _proxy_url = os.environ.get('SCRAPER_PROXY_URL')
    if _proxy_url:
        proxies = {'http': _proxy_url, 'https': _proxy_url}
    proxy_ip = '192.168.1.43:80'
85
+
86
@app.get('/')
def index():
    """Root endpoint: log the machine's hostname and answer ``OK`` (HTTP 200)."""
    logger.info('hostname: %s', HOSTNAME)
    return PlainTextResponse('OK', status_code=200)
91
+
92
@app.get('/ping')
def index():
    """Liveness probe: answers with an empty body and HTTP 200.

    NOTE(review): this function reuses the name ``index`` of the root
    handler. Both routes are registered by their decorators, but the module
    attribute ``index`` ends up bound to this function.
    """
    empty_ok = Response(status_code=200)
    return empty_ok
95
+
96
@app.get("/remote_ip")
def remote_ip(request: Request):
    """Return the caller's address (``request.client.host``) as plain text."""
    return PlainTextResponse(request.client.host, status_code=200)
100
+
101
@app.get("/task/faa_scrap_sold_listings_featured")
def faa_scrap_sold_listings_featured_local():
    """Scrape the Fine Art America recent-sales page and store one snapshot in MongoDB.

    Steps: download the page with a random user agent (through the
    module-level ``proxies``), parse it with ``parse_faa_sold_listings_page``,
    then insert a document ``{date_utc, results, processed}`` into collection
    ``faa_sl`` of a randomly chosen database ``faa_scrap_1`` .. ``faa_scrap_4``.

    Returns:
        ``PlainTextResponse``. Status 500 when ``MONGOATLAS_URI`` is unset.
        Otherwise status 200 with either an error text (download or parsing
        failed, or the site answered with a non-200 status) or the ``str()``
        of a summary dict with keys ``site``, ``status``, ``date``,
        ``results_count``, ``db_name`` and ``timeit``. A MongoDB failure is
        reported in the summary's ``status`` field.

    NOTE(review): this endpoint performs no access-key check.
    """
    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined", status_code=500)

    cnt_dbs = 4  # snapshots are spread randomly over this many databases

    headers = {'User-Agent': random.choice(USER_AGENTS)}
    # Random query parameter -- presumably a cache buster; confirm.
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(
        random.randint(1000, 1000000))

    # Download the page.
    try:
        start = time.time()
        # NOTE(review): verify=False disables TLS certificate verification.
        page = requests.get(site_url, proxies=proxies, timeout=30,
                            verify=False, headers=headers)
        timeit_request = time.time() - start
    except Exception as e:
        logger.error('request to %s failed: %s', site_url, e)
        return PlainTextResponse(str(e), 200)

    if page.status_code != 200:
        # Previously this case answered with the uninformative body '?'.
        message = "ERROR: unexpected HTTP status {}".format(page.status_code)
        logger.error(message)
        return PlainTextResponse(message, 200)

    # Parse the listings.
    try:
        start = time.time()
        listings = parse_faa_sold_listings_page(page.text)
        timeit_parsing = time.time() - start
    except Exception as e:
        logger.error('parsing the sold-listings page failed: %s', e)
        return PlainTextResponse(str(e), 200)

    # Store the snapshot. The timestamp stays a naive UTC datetime so stored
    # documents and the summary text keep their current shape.
    document = {
        'date_utc': datetime.datetime.utcnow(),
        'results': listings,
        'processed': False,
    }

    status = "ok"
    timeit_mongo = 0
    db_name = 'faa_scrap_' + str(random.randint(1, cnt_dbs))
    col_name = 'faa_sl'

    mongo_client = None
    try:
        start = time.time()
        mongo_client = MongoClient(MONGOATLAS_URI)
        mongo_client[db_name][col_name].insert_one(document)
        timeit_mongo = time.time() - start
    except Exception as e:
        status = "error saving to mongodb ({})".format(str(e))
        # Use the module logger (own handler, propagate=False), not the root logger.
        logger.error(status)
    finally:
        if mongo_client is not None:
            try:
                mongo_client.close()
            except Exception as e:
                # Closing is best effort, but no longer fails silently.
                logger.warning('closing the mongodb client failed: %s', e)

    # Report what happened.
    summary = {
        'site': "faa",
        'status': status,
        'date': document['date_utc'],
        'results_count': len(listings),
        'db_name': db_name,
        'timeit': {'request': timeit_request,
                   'parsing': timeit_parsing,
                   'db': timeit_mongo},
    }
    return PlainTextResponse(str(summary), 200)
184
+
185
+
186
def parse_faa_sold_listings_page(html):
    """Turn the recent-sales page markup into a list of listing dicts.

    Args:
        html: Page markup as a string.

    Returns:
        One dict per ``div.productImageDiv`` element, in document order, with
        the keys ``item_page``, ``image``, ``artist_page``, ``artist`` and
        ``sell_info``.

    Raises:
        TypeError / AttributeError: when a listing lacks one of the expected
        child elements (the lookups are not guarded).
    """
    prefix = 'https://fineartamerica.com/'
    tree = BeautifulSoup(html, 'lxml')  # "html.parser"
    results = []

    for block in tree.find_all('div', {'class': 'productImageDiv'}):
        record = dict()

        # Listing page: prefix links that do not start with 'http', then
        # normalise the path.
        href = block.find('a')['href']
        page_url = furl(href if href.startswith('http') else prefix + href)
        page_url.path.normalize()
        record['item_page'] = page_url.url

        record['image'] = block.find('img', {'class': 'productImage'})['src']

        # Artist page: same treatment as the listing link.
        artist_el = block.find('p', {'class': 'artistName'})
        href = artist_el.a['href']
        artist_url = furl(href if href.startswith('http') else prefix + href)
        artist_url.path.normalize()
        record['artist_page'] = artist_url.url

        record['artist'] = artist_el.text
        record['sell_info'] = block.find('p', {'class': 'orderLocation'}).text

        results.append(record)

    return results
226
+
227
+ if __name__ == "__main__":
228
+ import uvicorn
229
+ uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ uvicorn[standard]
2
+ fastapi
3
+ requests
4
+ pymongo
5
+ simplejson
6
+ pytz
7
+ beautifulsoup4
8
+ furl
9
+ lxml
10
+ apscheduler
11
+ fire
12
+ fake-useragent