Update app.py
app.py
CHANGED
@@ -1,3 +1,5 @@
+
+
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -8,6 +10,7 @@ from bs4 import BeautifulSoup
 import time
 from joblib import Parallel, delayed
 from nltk import ngrams
+from googlesearch import search
 
 @st.cache_data
 def convert_df(df):
@@ -55,33 +58,44 @@ def extract_website_domain(url):
 def google_address(address):
     # address_number = re.findall(r'\b\d+\b', address)[0]
     # address_zip =re.search(r'(\d{5})$', address).group()[:2]
+    all_data=[i for i in search(address, ssl_verify=False, advanced=True,
+                                num_results=11)]
 
-    search_query = quote(address)
-    url=f'https://www.google.com/search?q={search_query}'
-    response = requests.get(url)
-    soup = BeautifulSoup(response.content, "html.parser")
+
+    # search_query = quote(address)
+    # url=f'https://www.google.com/search?q={search_query}'
+    # response = requests.get(url)
+    # soup = BeautifulSoup(response.content, "html.parser")
+
+    # texts_links = []
+    # for link in soup.find_all("a"):
+    #     t,l=link.get_text(), link.get("href")
+    #     if (l[:11]=='/url?q=http') and (len(t)>20 ):
+    #         texts_links.append((t,l))
+
+    # text = soup.get_text()
+
+    # texts_links_des=[]
+    # for i,t_l in enumerate(texts_links):
+    #     start=text.find(texts_links[i][0][:50])
+    #     try:
+    #         end=text.find(texts_links[i+1][0][:50])
+    #     except:
+    #         end=text.find('Related searches')
+
+    #     description=text[start:end]
+    #     texts_links_des.append((t_l[0],t_l[1],description))
 
-    texts_links = []
-    for link in soup.find_all("a"):
-        t,l=link.get_text(), link.get("href")
-        if (l[:11]=='/url?q=http') and (len(t)>20 ):
-            texts_links.append((t,l))
+    # df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
 
-    text = soup.get_text()
+    df=pd.DataFrame({'Title':[i.title for i in all_data],
+                     'Link':[i.url for i in all_data],
+                     'Description':[i.description for i in all_data],})
 
-    texts_links_des=[]
-    for i,t_l in enumerate(texts_links):
-        start=text.find(texts_links[i][0][:50])
-        try:
-            end=text.find(texts_links[i+1][0][:50])
-        except:
-            end=text.find('Related searches')
-
-        description=text[start:end]
-        texts_links_des.append((t_l[0],t_l[1],description))
+    df=df.query("Title==Title")
+    df['Link']=df['Link'].str.replace('/www.','www.')
 
-    df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
-    df['Description']=df['Description'].bfill()
+    # df['Description']=df['Description'].bfill()
     df['Address Output']=df['Title'].str.extract(r'(.+? \d{5})').fillna("**DID NOT EXTRACT ADDRESS**")
 
     df['Link']=[i[7:i.find('&sa=')] for i in df['Link']]
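For context, the commit swaps the commented-out requests/BeautifulSoup scrape of the Google results page for the googlesearch package's advanced mode, whose result objects expose title, url, and description; google_address() now builds its DataFrame from those fields. Below is a minimal, self-contained sketch of that lookup path under the same assumptions as the diff (the lookup_address helper name and the example address are illustrative only, not part of app.py):

import pandas as pd
from googlesearch import search

def lookup_address(address: str) -> pd.DataFrame:
    # advanced=True returns result objects (title/url/description) instead of bare URL strings
    results = list(search(address, ssl_verify=False, advanced=True, num_results=11))

    df = pd.DataFrame({
        'Title': [r.title for r in results],
        'Link': [r.url for r in results],
        'Description': [r.description for r in results],
    })

    # "Title==Title" keeps only rows whose Title is not NaN
    df = df.query("Title==Title")
    # same literal substring replacement as in the diff
    df['Link'] = df['Link'].str.replace('/www.', 'www.')
    return df

if __name__ == "__main__":
    # hypothetical address for demonstration; requires network access
    print(lookup_address("1600 Pennsylvania Ave NW, Washington, DC 20500").head())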