Update app.py
app.py
CHANGED
@@ -1,3 +1,5 @@
+
+
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -8,6 +10,7 @@ from bs4 import BeautifulSoup
 import time
 from joblib import Parallel, delayed
 from nltk import ngrams
+from googlesearch import search
 
 @st.cache_data
 def convert_df(df):
@@ -55,33 +58,44 @@ def extract_website_domain(url):
 def google_address(address):
     # address_number = re.findall(r'\b\d+\b', address)[0]
     # address_zip =re.search(r'(\d{5})$', address).group()[:2]
+    all_data=[i for i in search(address, ssl_verify=False, advanced=True,
+                                num_results=11)]
 
-    search_query = quote(address)
-    url=f'https://www.google.com/search?q={search_query}'
-    response = requests.get(url)
-    soup = BeautifulSoup(response.content, "html.parser")
+
+    # search_query = quote(address)
+    # url=f'https://www.google.com/search?q={search_query}'
+    # response = requests.get(url)
+    # soup = BeautifulSoup(response.content, "html.parser")
+
+    # texts_links = []
+    # for link in soup.find_all("a"):
+    #     t,l=link.get_text(), link.get("href")
+    #     if (l[:11]=='/url?q=http') and (len(t)>20 ):
+    #         texts_links.append((t,l))
+
+    # text = soup.get_text()
+
+    # texts_links_des=[]
+    # for i,t_l in enumerate(texts_links):
+    #     start=text.find(texts_links[i][0][:50])
+    #     try:
+    #         end=text.find(texts_links[i+1][0][:50])
+    #     except:
+    #         end=text.find('Related searches')
+
+    #     description=text[start:end]
+    #     texts_links_des.append((t_l[0],t_l[1],description))
 
-    texts_links = []
-    for link in soup.find_all("a"):
-        t,l=link.get_text(), link.get("href")
-        if (l[:11]=='/url?q=http') and (len(t)>20 ):
-            texts_links.append((t,l))
+    # df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
 
-    text = soup.get_text()
+    df=pd.DataFrame({'Title':[i.title for i in all_data],
+                     'Link':[i.url for i in all_data],
+                     'Description':[i.description for i in all_data],})
 
-    texts_links_des=[]
-    for i,t_l in enumerate(texts_links):
-        start=text.find(texts_links[i][0][:50])
-        try:
-            end=text.find(texts_links[i+1][0][:50])
-        except:
-            end=text.find('Related searches')
-
-        description=text[start:end]
-        texts_links_des.append((t_l[0],t_l[1],description))
+    df=df.query("Title==Title")
+    df['Link']=df['Link'].str.replace('/www.','www.')
 
-    df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
-    df['Description']=df['Description'].bfill()
+    # df['Description']=df['Description'].bfill()
     df['Address Output']=df['Title'].str.extract(r'(.+? \d{5})').fillna("**DID NOT EXTRACT ADDRESS**")
 
     df['Link']=[i[7:i.find('&sa=')] for i in df['Link']]
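For context, the commit swaps the commented-out requests/BeautifulSoup scrape of the Google results page for the googlesearch package's advanced mode, whose result objects expose title, url, and description; google_address() now builds its DataFrame from those fields. Below is a minimal, self-contained sketch of that lookup path under the same assumptions as the diff (the lookup_address helper name and the example address are illustrative only, not part of app.py):

import pandas as pd
from googlesearch import search

def lookup_address(address: str) -> pd.DataFrame:
    # advanced=True returns result objects (title/url/description) instead of bare URL strings
    results = list(search(address, ssl_verify=False, advanced=True, num_results=11))

    df = pd.DataFrame({
        'Title': [r.title for r in results],
        'Link': [r.url for r in results],
        'Description': [r.description for r in results],
    })

    # "Title==Title" keeps only rows whose Title is not NaN
    df = df.query("Title==Title")
    # same literal substring replacement as in the diff
    df['Link'] = df['Link'].str.replace('/www.', 'www.')
    return df

if __name__ == "__main__":
    # hypothetical address for demonstration; requires network access
    print(lookup_address("1600 Pennsylvania Ave NW, Washington, DC 20500").head())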