|
|
|
""" |
|
NOTE 1: Start command for running this FastAPI app on Render:

@see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662

    uvicorn app:app --host 0.0.0.0 --port 10000
|
|
|
|
|
""" |
|
|
|
import os , sys |
|
import datetime , requests , random , logging , time , timeit |
|
import simplejson as json |
|
from fastapi import FastAPI |
|
from fastapi.responses import PlainTextResponse , HTMLResponse , Response , JSONResponse |
|
|
|
from starlette.requests import Request |
|
|
|
from bs4 import BeautifulSoup |
|
from furl import furl |
|
|
|
|
|
from pymongo import MongoClient |
|
import fire |
|
import socket |
|
import requests |
|
|
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
|
|
# Name of the machine running this process; logged by the root endpoint and
# compared against 'OCTOCORE' to decide whether the outbound proxy is used.
HOSTNAME = socket.gethostname()

# Desktop Firefox user-agent strings; the scrape task sends one picked at random.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0",
]

# Search-engine crawler user-agent strings.
# NOTE(review): not referenced anywhere in the visible part of this file.
BOT_AGENTS = [
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)",
]

# MongoDB connection string taken from the environment; None when unset, in
# which case the scrape endpoint answers with HTTP 500.
MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Root logger: bare messages at INFO and above; silence the chatty
# "requests" logger below ERROR.
logging.basicConfig(format='%(message)s', level=logging.INFO)
logging.getLogger("requests").setLevel(logging.ERROR)

# Dedicated console handler for this module's logger (message text only).
console_logger = logging.StreamHandler()
console_logger.setFormatter(logging.Formatter('%(message)s'))
console_logger.setLevel(logging.DEBUG)

# Module logger: DEBUG and above, emitted only through console_logger.
# propagate=False keeps records from also reaching the root handler, which
# would print every message twice.
logger = logging.getLogger(__name__)
logger.propagate = False
logger.setLevel(logging.DEBUG)
logger.addHandler(console_logger)
|
|
|
|
|
|
|
|
|
# ASGI application object; the route decorators below register on it.
app = FastAPI()

# NOTE(review): `port` and `scheduler` are not referenced in the visible part
# of this file (the __main__ guard passes 7860 to uvicorn directly) - confirm
# whether they are still needed.
port = 5000
scheduler = None

# Outbound proxy mapping handed to requests.get(); empty means a direct connection.
proxies = {}

if HOSTNAME == 'OCTOCORE':
    # Route outbound traffic through a LAN proxy on this specific host only.
    # NOTE(review): credentials are hard-coded here and the user-info part looks
    # redacted/garbled ("[email protected]"); the 'http' key also maps to an
    # https:// URL and vice versa. Confirm the intended values and consider
    # loading them from the environment instead.
    proxies = {
        'http': 'https://anonyland:[email protected]:8080',
        'https': 'http://anonyland:[email protected]:8080',
    }
    proxy_ip = '192.168.1.43:80'
|
|
|
@app.get('/')
def index():
    """Root endpoint: log the serving hostname and reply with plain-text ``OK``.

    Returns:
        PlainTextResponse with body ``OK`` and HTTP status 200.
    """
    message = f'hostname: {HOSTNAME}'
    logger.info(message)
    return PlainTextResponse(content='OK', status_code=200)
|
|
|
@app.get('/ping')
def ping():
    """Liveness probe: reply with an empty HTTP 200 response.

    Named ``ping`` rather than ``index`` so that it no longer redefines the
    root handler's module-level name; the registered route is unchanged.

    Returns:
        Response with no body and HTTP status 200.
    """
    return Response(status_code=200)
|
|
|
@app.get("/remote_ip")
def remote_ip(request: Request):
    """Echo back the address of the peer that made this request.

    Args:
        request: The incoming Starlette request.

    Returns:
        PlainTextResponse (HTTP 200) whose body is ``request.client.host``,
        or ``unknown`` when the server supplied no client information.
    """
    # request.client is None when the ASGI server does not provide peer
    # details; guard so that case does not surface as an HTTP 500.
    client = request.client
    client_host = client.host if client is not None else 'unknown'
    return PlainTextResponse(client_host, 200)
|
|
|
def _store_faa_listings(document, db_name, col_name):
    """Insert ``document`` into collection ``col_name`` of database ``db_name``.

    Connects to the deployment named by ``MONGOATLAS_URI``. Best effort:
    exceptions are logged and reported through the returned status instead of
    being propagated.

    Args:
        document: Dict to insert (pymongo adds an ``_id`` key to it in place).
        db_name: Target database name.
        col_name: Target collection name.

    Returns:
        ``(status, elapsed)`` where ``status`` is ``"ok"`` or an
        ``"error saving to mongodb (...)"`` message and ``elapsed`` is the
        number of seconds spent connecting and inserting (0 on failure).
    """
    status = "ok"
    elapsed = 0
    mongo_client = None
    try:
        start = time.time()
        mongo_client = MongoClient(MONGOATLAS_URI)
        mongo_client[db_name][col_name].insert_one(document)
        elapsed = time.time() - start
    except Exception as e:
        status = "error saving to mongodb ({})".format(str(e))
        logger.error(status)
    finally:
        # The client is still None if the constructor itself raised.
        if mongo_client is not None:
            try:
                mongo_client.close()
            except Exception as e:
                # Closing is best effort; the insert outcome is already decided.
                logger.debug("ignoring error while closing mongo client: %s", e)
    return status, elapsed


@app.get("/task/faa_scrap_sold_listings_featured")
def faa_scrap_sold_listings_featured_local():
    """Scrape Fine Art America's recent-sales page and store the listings in MongoDB.

    Steps: fetch the page (through ``proxies`` when configured), parse it with
    ``parse_faa_sold_listings_page``, insert one document into a randomly
    chosen ``faa_scrap_<n>`` database, and report a summary.

    Returns:
        PlainTextResponse. HTTP 500 when ``MONGOATLAS_URI`` is unset. In every
        other case HTTP 200, with a body that is either ``str()`` of a summary
        dict (site, status, date, results_count, db_name, timeit) or an error
        description when fetching or processing failed.
    """
    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined", status_code=500)

    # Number of databases (faa_scrap_1 .. faa_scrap_<cnt_dbs>) writes are spread over.
    cnt_dbs = 4

    headers = {'User-Agent': random.choice(USER_AGENTS)}
    # A random `rand` query parameter is appended - presumably to avoid cached pages.
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(
        random.randint(1000, 1000000)
    )

    try:
        start = time.time()
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept as in the original; confirm it is still required.
        page = requests.get(site_url, proxies=proxies, timeout=30, verify=False, headers=headers)
        timeit_request = time.time() - start
    except Exception as e:
        logger.error("faa request failed: %s", e)
        return PlainTextResponse(str(e), 200)

    if page.status_code != 200:
        # Report what upstream answered instead of an opaque placeholder.
        response_body = "ERROR: unexpected HTTP status {} from {}".format(page.status_code, site_url)
        logger.error(response_body)
        return PlainTextResponse(response_body, 200)

    try:
        start = time.time()
        listings = parse_faa_sold_listings_page(page.text)
        timeit_parsing = time.time() - start

        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; kept
        # because switching to an aware datetime changes the reported text.
        document = {
            'date_utc': datetime.datetime.utcnow(),
            'results': listings,
            'processed': False,
        }

        db_name = 'faa_scrap_' + str(random.randint(1, cnt_dbs))
        status, timeit_mongo = _store_faa_listings(document, db_name, 'faa_sl')

        summary = {
            'site': "faa",
            'status': status,
            'date': document['date_utc'],
            'results_count': len(listings),
            'db_name': db_name,
            'timeit': {
                'request': timeit_request,
                'parsing': timeit_parsing,
                'db': timeit_mongo,
            },
        }
        response_body = str(summary)
    except Exception as e:
        logger.exception("faa sold listings processing failed")
        response_body = str(e)

    return PlainTextResponse(response_body, 200)
|
|
|
|
|
def _absolute_faa_url(href):
    """Return ``href`` as an absolute fineartamerica.com URL with a normalized path."""
    if not href.startswith('http'):
        href = 'https://fineartamerica.com/' + href
    url = furl(href)
    # Path normalization tidies the joined URL (e.g. a doubled '/' when the
    # relative href already started with a slash).
    url.path.normalize()
    return url.url


def parse_faa_sold_listings_page(html):
    """Extract the sold listings from a Fine Art America recent-sales page.

    Args:
        html: Page markup as text.

    Returns:
        A list with one dict per ``div.productImageDiv`` element, in document
        order, with keys ``item_page``, ``image``, ``artist_page``, ``artist``
        and ``sell_info``. Empty when the page has no such elements.

    Raises:
        TypeError, AttributeError or KeyError: when a listing lacks one of the
            expected child elements or attributes (lookups are not guarded, so
            one malformed listing fails the whole parse).
    """
    soup = BeautifulSoup(html, 'lxml')

    listings = []
    for listing_el in soup.find_all('div', {'class': 'productImageDiv'}):
        item_page = _absolute_faa_url(listing_el.find('a')['href'])
        image = listing_el.find('img', {'class': 'productImage'})['src']

        # One lookup serves both the artist link and the artist name.
        artist_el = listing_el.find('p', {'class': 'artistName'})
        artist_page = _absolute_faa_url(artist_el.a['href'])

        sell_info = listing_el.find('p', {'class': 'orderLocation'}).text

        listings.append({
            'item_page': item_page,
            'image': image,
            'artist_page': artist_page,
            'artist': artist_el.text,
            'sell_info': sell_info,
        })

    return listings
|
|
|
if __name__ == "__main__":
    # Direct execution (`python <this file>`): serve the app with uvicorn on
    # all interfaces, port 7860. uvicorn is imported here so that importing
    # this module elsewhere does not require it at import time.
    from uvicorn import run as run_server

    run_server(app, host="0.0.0.0", port=7860)