Spaces · Build error
georgiad committed · Commit f0bc6d4 · Parent(s): 8f98272
Switched to single feed

Files changed:
  newsletter_production_mercury.ipynb  +436 -485
  vc_newsletter_results.xlsx           +0 -0
newsletter_production_mercury.ipynb (CHANGED)

@@ -1,493 +1,444 @@
 {
- … (old lines 2-73 removed; content not rendered in the diff)
  {
-
- "
-
  ]
-    },
-    "execution_count": 2,
-    "metadata": {},
-    "output_type": "execute_result"
  }
-
-
- "
-    "            \"Batteries \",\n",
-    "            \"Battery Management systems\",\n",
-    "            \"Lidars\",\n",
-    "            \"RADARS\",\n",
-    "            \"AI\",\n",
-    "            \"Industrial AI\",\n",
-    "            \"Transportation\",\n",
-    "            \"Mobility\",\n",
-    "            \"Climate Tech\",\n",
-    "            \"Sustainable grid\",\n",
-    "            \"Sensor fusion\",\n",
-    "            \"Computer vision\",\n",
-    "            \"Data Analytics\",\n",
-    "            \"Digital Twins\",\n",
-    "            \"Automotive Cybersecurity\",\n",
-    "            \"Logistics\",\n",
-    "            \"Ports\",\n",
-    "            \"Construction sites\",\n",
-    "            \"Mines\",\n",
-    "            \"Quarries\",\n",
-    "            \"Trucks\",\n",
-    "            \"Power train\",\n",
-    "            \"Software defined vehicle\"]\n",
-    "'''\n",
-    "feed_dict = {'a16z.com/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/64844802/a16z',\n",
-    "             'sequoiacap.com/article': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/24775410/sequoia',\n",
-    "             'zettavp.com/playbook/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/3952740738/ZettaVentures',\n",
-    "             'atomico.com/insights/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/44579473/atomico',\n",
-    "             'nt-z.ro/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/793585473956425728/Breakthrough',\n",
-    "             'accel.com/noteworthy': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/33846451/Accel',\n",
-    "             'felicis.com/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/382162909/felicis',\n",
-    "             'scalevp.com/blog/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/17904605/scalevp',\n",
-    "             'redpoint.com/start/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/104286459/Redpoint',\n",
-    "             '83north.com/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/2963095189/83NorthVC',\n",
-    "             'bvp.com/atlas/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/59166197/BessemerVP'}\n",
-    "'''\n",
-    "\n",
-    "feed_dict = {'a16z.com/': 'https://nitter.kavin.rocks/a16z/rss'}\n",
-    "''',\n",
-    "             'sequoiacap.com/article': 'https://nitter.kavin.rocks/sequoia/rss',\n",
-    "             'zettavp.com/playbook/': 'https://nitter.kavin.rocks/ZettaVentures/rss',\n",
-    "             'atomico.com/insights/': 'https://nitter.kavin.rocks/atomico/rss',\n",
-    "             'nt-z.ro/': 'https://nitter.kavin.rocks/Breakthrough/rss',\n",
-    "             'accel.com/noteworthy': 'https://nitter.kavin.rocks/Accel/rss',\n",
-    "             'felicis.com/': 'https://nitter.kavin.rocks/felicis/rss',\n",
-    "             'scalevp.com/blog/': 'https://nitter.kavin.rocks/scalevp/rss',\n",
-    "             'redpoint.com/start/': 'https://nitter.kavin.rocks/Redpoint/rss',\n",
-    "             '83north.com/': 'https://nitter.kavin.rocks/83NorthVC/rss',\n",
-    "             'bvp.com/atlas/': 'https://nitter.kavin.rocks/BessemerVP/rss'}\n",
-    "'''"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "id": "Ig5nSCbI6yuL"
-   },
-   "outputs": [],
-   "source": [
-    "from keybert import KeyBERT\n",
-    "import pandas as pd\n",
-    "from keyphrase_vectorizers import KeyphraseCountVectorizer\n",
-    "from sentence_transformers import SentenceTransformer\n",
-    "import numpy as np\n",
-    "from sklearn.metrics.pairwise import cosine_similarity\n",
-    "\n",
-    "import feedparser\n",
-    "import requests\n",
-    "from bs4 import BeautifulSoup\n",
-    "from openpyxl import Workbook\n",
-    "import time\n",
-    "import pickle\n",
-    "import os\n",
-    "from tqdm import tqdm\n",
-    "from concurrent.futures import ThreadPoolExecutor\n",
-    "#from functools import lru_cache\n",
-    "\n",
-    "# Define function to extract keywords from the HTML body using the YAKE keyword extractor\n",
-    "def extract_keyphrases(text, kw_model, vectorizer, embedding_model):\n",
-    "    kph = [kw for kw, score in kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', vectorizer=vectorizer, use_mmr=True)]\n",
-    "    keyphrase_embeddings = embedding_model.encode(kph)\n",
-    "    return kph, keyphrase_embeddings\n",
-    "\n",
-    "def get_similarity_scores(keyword_embeddings, keyphrase_embeddings):\n",
-    "    similarity_scores = cosine_similarity(keyphrase_embeddings, keyword_embeddings).max(axis=1).astype(str).tolist()\n",
-    "    similarity_max = cosine_similarity(keyphrase_embeddings, keyword_embeddings).flatten().max().astype(str)\n",
-    "    return similarity_scores, similarity_max\n",
-    "\n",
-    "# Define function to get the redirected URL (if any) for a given URL\n",
-    "def get_redirected_url(url_record, headers, expected_codes=(301, 302, 303, 307), timeout=60):\n",
-    "    try:\n",
-    "        res = requests.head(url_record['url'], headers=headers, timeout=timeout)\n",
-    "        if res.status_code in expected_codes:\n",
-    "            url_record['url'] = res.headers['location']\n",
-    "        elif res.status_code == 200:\n",
-    "            url_record['url'] = url_record['url']\n",
-    "        else:\n",
-    "            print(f\"Retrieving {url_record['url']} failed: Expected {expected_codes}, but received {res.status_code}: {res.reason}\")\n",
-    "    except requests.exceptions.Timeout:\n",
-    "        print(f\"\\nRequest timed out for {url_record['url']}\")\n",
-    "        return url_record\n",
-    "    except:\n",
-    "        return url_record\n",
-    "\n",
-    "    return url_record\n",
-    "\n",
-    "# Define function to get the HTML body of a given URL\n",
-    "def get_html_body(url, headers):\n",
-    "    try:\n",
-    "        response = requests.get(url, headers=headers, timeout=10)\n",
-    "        html = response.content\n",
-    "        soup = BeautifulSoup(html, 'html.parser')\n",
-    "        return soup.body.get_text()\n",
-    "    except:\n",
-    "        return ''\n",
-    "\n",
-    "# Define function to write data to the Excel sheet\n",
-    "def write_data_to_excel(url_dict, filename):\n",
-    "    # Create a new Excel workbook and worksheet\n",
-    "    workbook = Workbook()\n",
-    "    worksheet = workbook.active\n",
-    "    worksheet.title = 'RSS Feeds'\n",
-    "\n",
-    "    # Write the headers for the Excel sheet\n",
-    "    worksheet.cell(row=1, column=1, value='Feed Name')\n",
-    "    worksheet.cell(row=1, column=2, value='URL')\n",
-    "    worksheet.cell(row=1, column=3, value='Updated')\n",
-    "    worksheet.cell(row=1, column=4, value='Keyphrases')\n",
-    "    worksheet.cell(row=1, column=5, value='Similarity to supplied keywords')\n",
-    "    worksheet.cell(row=1, column=6, value='Similarity (max)')\n",
-    "    worksheet.cell(row=1, column=7, value='HTML Body')\n",
-    "\n",
-    "    # Loop over the unique URLs and write them to the Excel sheet\n",
-    "    row_num = 2\n",
-    "    for url, data in url_dict.items():\n",
-    "        worksheet.cell(row=row_num, column=1, value=data['feed_name'])\n",
-    "        worksheet.cell(row=row_num, column=2, value=url)\n",
-    "        worksheet.cell(row=row_num, column=3, value=data['updated'])\n",
-    "        worksheet.cell(row=row_num, column=4, value=data['keyphrases'])\n",
-    "        worksheet.cell(row=row_num, column=5, value=data['similarity'])\n",
-    "        worksheet.cell(row=row_num, column=6, value=data['similarity_max'])\n",
-    "        worksheet.cell(row=row_num, column=7, value=data['html_body'])\n",
-    "\n",
-    "        row_num += 1\n",
-    "\n",
-    "    worksheet.freeze_panes = 'A2'\n",
-    "\n",
-    "    # Set the number format for column A, except the first row\n",
-    "    for row in worksheet.iter_rows(min_row=2, min_col=3, max_col=3):\n",
-    "        for cell in row:\n",
-    "            cell.number_format = 'mm/dd/yyyy hh:mm:ss'\n",
-    "\n",
-    "    # Save the Excel workbook\n",
-    "    workbook.save(filename)\n",
-    "\n",
-    "    # Print confirmation message\n",
-    "    #print(f'RSS output written to excel sheet: {filename}')\n",
-    "\n",
-    "def remaining_entries_from_dict(filename, dictionary):\n",
-    "    pickle_data = {}\n",
-    "    if os.path.exists(filename):\n",
-    "        with open(filename, 'rb') as f:\n",
-    "            pickle_data = pickle.load(f)\n",
-    "    return list(set(dictionary.keys()) - set(pickle_data.keys()))\n",
-    "\n",
-    "def process_url(url):\n",
-    "    global url_dict\n",
-    "    \n",
-    "    #body = get_html_body(url, headers)\n",
-    "    #kph,keyphrase_embeddings = extract_keyphrases(body, kw_model, vectorizer, embedding_model)\n",
-    "    #similarity, similarity_max = get_similarity_scores(keyword_embeddings, keyphrase_embeddings)\n",
-    "\n",
-    "    #url_dict[url]['keyphrases'] = ', '.join(kph)\n",
-    "    #url_dict[url]['similarity'] = ', '.join(similarity)\n",
-    "    #url_dict[url]['similarity_max'] = similarity_max\n",
-    "    #url_dict[url]['html_body'] = body\n",
-    "    \n",
-    "    url_dict[url]['keyphrases'] = ''\n",
-    "    url_dict[url]['similarity'] = ''\n",
-    "    url_dict[url]['similarity_max'] = ''\n",
-    "    url_dict[url]['html_body'] = \"Skipping this part, to speed up the process\"\n",
-    "\n",
-    "    # Store temporary results to disk\n",
-    "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
-    "    #    pickle.dump(url_dict, f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
    "colab": {
-
    },
- "
- "
-
-
-
-    "import pprint\n",
-    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
-    "from tqdm import tqdm\n",
-    "from datetime import datetime\n",
-    "import nltk\n",
-    "\n",
-    "\n",
-    "# Start the profiling timer\n",
-    "start_time = time.time()\n",
-    "\n",
-    "# Initialize the SentenceTransformer model\n",
-    "kw_model = KeyBERT('distilbert-base-nli-mean-tokens')\n",
-    "vectorizer = KeyphraseCountVectorizer()\n",
-    "embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')\n",
-    "nltk.download('stopwords', quiet=True)\n",
-    "\n",
-    "# Initialize variables\n",
-    "headers = {\n",
-    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'\n",
-    "}\n",
-    "keyword_embeddings = embedding_model.encode(keywords)  # Encode keywords using the embedding model\n",
-    "\n",
-    "def read_feeds(feed_list, how_old):\n",
-    "    global urls\n",
-    "    import sys\n",
-    "    import io\n",
-    "    from datetime import datetime, timedelta\n",
-    "\n",
-    "    old_stdout = sys.stdout\n",
-    "    sys.stdout = mystdout = io.StringIO()\n",
-    "    print(\"A small change\")\n",
-    "    # Loop over the RSS feeds and keywords\n",
-    "    urls_temp = []\n",
-    "    urls = []\n",
-    "    feed_item_age_minimum = datetime.now() - timedelta(days=int(how_old.split()[0]))\n",
-    "\n",
-    "    for keyword, rss_feed in tqdm(feed_list.items(), total=len(feed_list.items()), file=sys.stdout, bar_format='Reading feeds: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
-    "        print(rss_feed)\n",
-    "        feed_name = rss_feed.split('/')[-1]\n",
-    "        feed = feedparser.parse(rss_feed)\n",
-    "        for entry in tqdm(feed.entries, total=len(feed.entries), file=sys.stdout, bar_format='\\tReading feed entries: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
-    "            if 'content' in entry:\n",
-    "                for content in entry.content:\n",
-    "                    soup = BeautifulSoup(content.get('value'), 'html.parser')\n",
-    "                    updated = datetime.strptime(entry.updated, '%Y-%m-%dT%H:%M:%SZ')\n",
-    "                    if updated > feed_item_age_minimum:\n",
-    "                        urls_temp.extend([{'url': link.get('href'), 'updated': updated, 'feed_name': feed_name} for link in soup.find_all('a')])\n",
-    "\n",
-    "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
-    "        futures = [executor.submit(get_redirected_url, url, headers) for url in urls_temp]\n",
-    "        for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Checking URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
-    "            urls.append(future.result())\n",
-    "\n",
-    "    sys.stdout = old_stdout\n",
-    "    return mystdout.getvalue()\n",
-    "\n",
-    "def read_process_urls():\n",
-    "    import sys\n",
-    "    import io\n",
-    "    from datetime import datetime, timedelta\n",
-    "    old_stdout = sys.stdout\n",
-    "    sys.stdout = mystdout = io.StringIO()\n",
-    "\n",
-    "    global urls\n",
-    "    global url_dict\n",
-    "    kw_for = {rss_feed.split('/')[-1]: keyword for keyword, rss_feed in feed_dict.items()}\n",
-    "\n",
-    "    #print(f\"Urls: {urls}\")\n",
-    "    url_dict = {}\n",
-    "    for item in urls:\n",
-    "        feed_name = item['feed_name']\n",
-    "        updated = item['updated']\n",
-    "        url = item['url']\n",
-    "\n",
-    "        import pprint\n",
-    "        pprint.pprint(url)\n",
-    "        pprint.pprint(kw_for[feed_name])\n",
-    "        if kw_for[feed_name] in url:\n",
-    "            if url not in url_dict.keys():\n",
-    "                url_dict[url] = {'updated': updated, 'feed_name': feed_name}\n",
-    "            else:\n",
-    "                if url_dict[url]['updated'] > updated:\n",
-    "                    url_dict[url]['updated'] = updated\n",
-    "\n",
-    "    start_parallel_loop_time = time.time()\n",
-    "    results = []\n",
-    "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
-    "        futures = [executor.submit(process_url, url) for url in url_dict.keys()]#remaining_entries_from_dict(\"retrieved_urls.pkl\", url_dict)]\n",
-    "        for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Reading URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
-    "            results.append(future.result())\n",
-    "    #print(f\"Parallel URL processing: {time.time() - start_parallel_loop_time:.3f} seconds\")\n",
-    "    print(f\"Total links processed: {len(url_dict.keys())}\")\n",
-    "\n",
-    "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
-    "    #    pickle.dump(url_dict, f)\n",
-    "\n",
-    "    # Print the time taken by the main function\n",
-    "    print(f\"Total time needed: {time.time() - start_time:.3f} seconds\")\n",
-    "\n",
-    "    # Write dataset to the Excel sheet\n",
-    "    write_data_to_excel(url_dict, 'newsletter_results.xlsx')\n",
-    "\n",
-    "    sys.stdout = old_stdout\n",
-    "    return mystdout.getvalue()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 622,
-     "referenced_widgets": [
-      "55c86791cb9a491c8bb6217af108fa16",
-      "63d6badce4dc45a0a4af8660c9a96b5d",
-      "733bbc713dea4917bcdde749cafbd99c",
-      "975eed9f527e441c99ff509b5cc53bc1",
-      "3ef1bf00fdec434ebfe32715777368ff",
-      "cf14a6e4b11840bdae7166460ea8485f",
-      "1991f11a746b4c4ba005ae4eee9ae0ad",
-      "4835d51282334101ac67c821bd5d628c",
-      "106ead77e8064e65844b810e3f5d574d",
-      "6537a1a32eab44b08900a48a6f005dd7",
-      "d66e01f3991c4ec69e3e2f9d204c3898",
-      "aa70b01aa66147f592652a30b890c888",
-      "ff0115a2cae843c1b164e4b7d179ff24",
-      "c52d63ae33c14fe9befe4debb4f7ab21",
-      "eeb751b9682144ca90b1d467a6f525aa",
-      "7347a703f72c4d459217afb92c3c05b1",
-      "583f99b9cfd945a4afaede6cfa0961db",
-      "c22b236a142144d3a0fdae9163a795d3",
-      "93719585652c49cc8c32455577423532",
-      "b30da31352da4278b8180b7f2c4080fa",
-      "7ed878a7f4df4e52989d8e16c542842b",
-      "2e7d9d0f096a4b5592ab3116766c9158",
-      "03448be097a84dd6bc0c5064166b22d0",
-      "938b6fc3ee6a4cbd95196a6e5ab111d9",
-      "5728cc854a2a4c6fa037b78889dedb20",
-      "1d8c3034625f4b598a1875b74f7efdfd",
-      "0cc60859ebd7478fad11cb51113da820",
-      "1bb65fbc92144355a0977d88d6c3bf50",
-      "81ca37de802f427791e4d0f740735d0a",
-      "a27f0a049a5b4d869d1297afac81099f",
-      "bfa325bddbec4f98bd60e436461db0af",
-      "27a8d7b324ba43798be54e7685eb345a",
-      "8db6fe180d234c7d9d60cadcfadc5bc7",
-      "8e4869a380aa4f5faa05335c16d6e972",
-      "997639d0dcc8461a89fb9bcdcfffacbe"
-     ]
    },
- "
- … (old lines 439-449 not rendered)
-    "display(HTML(f\"------------------------------\"))\n",
-    "\n",
-    "for url in url_dict.keys():\n",
-    "    #print(url)\n",
-    "    display(HTML(f\"{url}\"))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "urls"
-   ]
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "provenance": []
-  },
-  "gpuClass": "standard",
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
  },
- "
-
-
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
- }
+ "cells": [
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "id": "uIxcPJeuGGAF"
+   },
+   "source": [
+    "---\n",
+    "title: Newsletter Helper\n",
+    "description: Follow the instructions on screen\n",
+    "show-code: false\n",
+    "params:\n",
+    "    feed_keywords:\n",
+    "        label: Sources\n",
+    "        input: select\n",
+    "        value: ['a16z.com/',\n",
+    "            'sequoiacap.com/article',\n",
+    "            'zettavp.com/playbook/',\n",
+    "            'atomico.com/insights/',\n",
+    "            'nt-z.ro/',\n",
+    "            'accel.com/noteworthy',\n",
+    "            'felicis.com/',\n",
+    "            'scalevp.com/blog/',\n",
+    "            'redpoint.com/start/',\n",
+    "            '83north.com/',\n",
+    "            'bvp.com/atlas/']\n",
+    "        choices: ['a16z.com/',\n",
+    "            'sequoiacap.com/article',\n",
+    "            'zettavp.com/playbook/',\n",
+    "            'atomico.com/insights/',\n",
+    "            'nt-z.ro/',\n",
+    "            'accel.com/noteworthy',\n",
+    "            'felicis.com/',\n",
+    "            'scalevp.com/blog/',\n",
+    "            'redpoint.com/start/',\n",
+    "            '83north.com/',\n",
+    "            'bvp.com/atlas/']\n",
+    "        multi: True\n",
+    "    feed_age:\n",
+    "        label: How old?\n",
+    "        input: select\n",
+    "        value: '7 days'\n",
+    "        choices: ['7 days', '14 days', '30 days']\n",
+    "        multi: False\n",
+    "---"
+   ]
+  },
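The raw cell above is the Mercury-style YAML header that turns the notebook into a web app: each entry under `params` becomes a UI widget, and the chosen widget values override same-named variables in the notebook. A minimal sketch of that assumed wiring (the values here are hypothetical, not part of the commit):

    # Hedged sketch: a Mercury-style runner is assumed to inject the UI
    # selections over the defaults assigned in the next code cell.
    ui_values = {
        "feed_keywords": ["a16z.com/", "bvp.com/atlas/"],  # hypothetical picks
        "feed_age": "14 days",
    }
    globals().update(ui_values)  # feed_keywords / feed_age now shadow the defaults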
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "id": "pfJ5NpqjCT1U"
+   },
+   "outputs": [],
+   "source": [
+    "feed_keywords = ['a16z.com/',\n",
+    "                 'sequoiacap.com/article',\n",
+    "                 'zettavp.com/playbook/',\n",
+    "                 'atomico.com/insights/',\n",
+    "                 'nt-z.ro/',\n",
+    "                 'accel.com/noteworthy',\n",
+    "                 'felicis.com/',\n",
+    "                 'scalevp.com/blog/',\n",
+    "                 'redpoint.com/start/',\n",
+    "                 '83north.com/',\n",
+    "                 'bvp.com/atlas/']\n",
+    "feed_age = '28 days'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "mEOS4asyGGAI"
+   },
+   "outputs": [],
+   "source": [
+    "keywords = [\"Electro mobility\",\n",
+    "            \"Batteries \",\n",
+    "            \"Battery Management systems\",\n",
+    "            \"Lidars\",\n",
+    "            \"RADARS\",\n",
+    "            \"AI\",\n",
+    "            \"Industrial AI\",\n",
+    "            \"Transportation\",\n",
+    "            \"Mobility\",\n",
+    "            \"Climate Tech\",\n",
+    "            \"Sustainable grid\",\n",
+    "            \"Sensor fusion\",\n",
+    "            \"Computer vision\",\n",
+    "            \"Data Analytics\",\n",
+    "            \"Digital Twins\",\n",
+    "            \"Automotive Cybersecurity\",\n",
+    "            \"Logistics\",\n",
+    "            \"Ports\",\n",
+    "            \"Construction sites\",\n",
+    "            \"Mines\",\n",
+    "            \"Quarries\",\n",
+    "            \"Trucks\",\n",
+    "            \"Power train\",\n",
+    "            \"Software defined vehicle\"]\n",
+    "\n",
+    "feed = \"https://www.rssground.com/p/Newsletter\""
+   ]
+  },
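The cell above is the heart of this commit: the per-VC `feed_dict` of the old version is gone, and a single aggregated RSS URL is used instead. A minimal sketch, assuming the feed is reachable, of what `feedparser` exposes for it; the fields `title`, `published`, and `summary` are the ones `read_feeds` below relies on:

    import feedparser

    d = feedparser.parse("https://www.rssground.com/p/Newsletter")
    for entry in d.entries[:3]:
        # 'published' is an RFC 822 date string; 'summary' carries the HTML
        # from which read_feeds extracts the <a href=...> links
        print(entry.title, "|", entry.published)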
+  {
+   "cell_type": "code",
+   "source": [
+    "#!pip install keybert\n",
+    "#!pip install feedparser\n",
+    "#!pip install keyphrase_vectorizers\n",
+    "#!pip install sentence-transformers"
+   ],
+   "metadata": {
+    "id": "WMswc6FCGR9T"
+   },
+   "execution_count": 11,
+   "outputs": []
+  },
  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "id": "Ig5nSCbI6yuL"
+   },
+   "outputs": [],
+   "source": [
+    "from keybert import KeyBERT\n",
+    "import pandas as pd\n",
+    "from keyphrase_vectorizers import KeyphraseCountVectorizer\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "import numpy as np\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "import feedparser\n",
+    "import requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "from openpyxl import Workbook\n",
+    "import time\n",
+    "import pickle\n",
+    "import os\n",
+    "from tqdm import tqdm\n",
+    "from concurrent.futures import ThreadPoolExecutor\n",
+    "#from functools import lru_cache\n",
+    "\n",
+    "# Define function to extract keywords from the HTML body using the YAKE keyword extractor\n",
+    "def extract_keyphrases(text, kw_model, vectorizer, embedding_model):\n",
+    "    kph = [kw for kw, score in kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', vectorizer=vectorizer, use_mmr=True)]\n",
+    "    keyphrase_embeddings = embedding_model.encode(kph)\n",
+    "    return kph, keyphrase_embeddings\n",
+    "\n",
+    "def get_similarity_scores(keyword_embeddings, keyphrase_embeddings):\n",
+    "    similarity_scores = cosine_similarity(keyphrase_embeddings, keyword_embeddings).max(axis=1).astype(str).tolist()\n",
+    "    similarity_max = cosine_similarity(keyphrase_embeddings, keyword_embeddings).flatten().max().astype(str)\n",
+    "    return similarity_scores, similarity_max\n",
+    "\n",
+    "# Define function to get the redirected URL (if any) for a given URL\n",
+    "def get_redirected_url(url_record, headers, expected_codes=(301, 302, 303, 307), timeout=60):\n",
+    "    try:\n",
+    "        res = requests.head(url_record['url'], headers=headers, timeout=timeout)\n",
+    "        if res.status_code in expected_codes:\n",
+    "            url_record['url'] = res.headers['location']\n",
+    "        elif res.status_code == 200:\n",
+    "            url_record['url'] = url_record['url']\n",
+    "        else:\n",
+    "            print(f\"Retrieving {url_record['url']} failed: Expected {expected_codes}, but received {res.status_code}: {res.reason}\")\n",
+    "    except requests.exceptions.Timeout:\n",
+    "        print(f\"\\nRequest timed out for {url_record['url']}\")\n",
+    "        return url_record\n",
+    "    except:\n",
+    "        return url_record\n",
+    "\n",
+    "    return url_record\n",
+    "\n",
+    "# Define function to get the HTML body of a given URL\n",
+    "def get_html_body(url, headers):\n",
+    "    try:\n",
+    "        response = requests.get(url, headers=headers, timeout=10)\n",
+    "        html = response.content\n",
+    "        soup = BeautifulSoup(html, 'html.parser')\n",
+    "        return soup.body.get_text()\n",
+    "    except:\n",
+    "        return ''\n",
+    "\n",
+    "# Define function to write data to the Excel sheet\n",
+    "def write_data_to_excel(url_dict, filename):\n",
+    "    # Create a new Excel workbook and worksheet\n",
+    "    workbook = Workbook()\n",
+    "    worksheet = workbook.active\n",
+    "    worksheet.title = 'RSS Feeds'\n",
+    "\n",
+    "    # Write the headers for the Excel sheet\n",
+    "    worksheet.cell(row=1, column=1, value='Feed Name')\n",
+    "    worksheet.cell(row=1, column=2, value='URL')\n",
+    "    worksheet.cell(row=1, column=3, value='Updated')\n",
+    "    worksheet.cell(row=1, column=4, value='Keyphrases')\n",
+    "    worksheet.cell(row=1, column=5, value='Similarity to supplied keywords')\n",
+    "    worksheet.cell(row=1, column=6, value='Similarity (max)')\n",
+    "    worksheet.cell(row=1, column=7, value='HTML Body')\n",
+    "\n",
+    "    # Loop over the unique URLs and write them to the Excel sheet\n",
+    "    row_num = 2\n",
+    "    for url, data in url_dict.items():\n",
+    "        worksheet.cell(row=row_num, column=1, value=data['feed_name'])\n",
+    "        worksheet.cell(row=row_num, column=2, value=url)\n",
+    "        worksheet.cell(row=row_num, column=3, value=data['updated'])\n",
+    "        worksheet.cell(row=row_num, column=4, value=data['keyphrases'])\n",
+    "        worksheet.cell(row=row_num, column=5, value=data['similarity'])\n",
+    "        worksheet.cell(row=row_num, column=6, value=data['similarity_max'])\n",
+    "        worksheet.cell(row=row_num, column=7, value=data['html_body'])\n",
+    "\n",
+    "        row_num += 1\n",
+    "\n",
+    "    worksheet.freeze_panes = 'A2'\n",
+    "\n",
+    "    # Set the number format for column A, except the first row\n",
+    "    for row in worksheet.iter_rows(min_row=2, min_col=3, max_col=3):\n",
+    "        for cell in row:\n",
+    "            cell.number_format = 'mm/dd/yyyy hh:mm:ss'\n",
+    "\n",
+    "    # Save the Excel workbook\n",
+    "    workbook.save(filename)\n",
+    "\n",
+    "    # Print confirmation message\n",
+    "    #print(f'RSS output written to excel sheet: {filename}')\n",
+    "\n",
+    "def remaining_entries_from_dict(filename, dictionary):\n",
+    "    pickle_data = {}\n",
+    "    if os.path.exists(filename):\n",
+    "        with open(filename, 'rb') as f:\n",
+    "            pickle_data = pickle.load(f)\n",
+    "    return list(set(dictionary.keys()) - set(pickle_data.keys()))\n",
+    "\n",
+    "def process_url(url):\n",
+    "    global url_dict\n",
+    "    \n",
+    "    #body = get_html_body(url, headers)\n",
+    "    #kph,keyphrase_embeddings = extract_keyphrases(body, kw_model, vectorizer, embedding_model)\n",
+    "    #similarity, similarity_max = get_similarity_scores(keyword_embeddings, keyphrase_embeddings)\n",
+    "\n",
+    "    #url_dict[url]['keyphrases'] = ', '.join(kph)\n",
+    "    #url_dict[url]['similarity'] = ', '.join(similarity)\n",
+    "    #url_dict[url]['similarity_max'] = similarity_max\n",
+    "    #url_dict[url]['html_body'] = body\n",
+    "    \n",
+    "    url_dict[url]['keyphrases'] = ''\n",
+    "    url_dict[url]['similarity'] = ''\n",
+    "    url_dict[url]['similarity_max'] = ''\n",
+    "    url_dict[url]['html_body'] = \"Skipping this part, to speed up the process\"\n",
+    "\n",
+    "    # Store temporary results to disk\n",
+    "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
+    "    #    pickle.dump(url_dict, f)"
+   ]
+  },
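For reference, a hedged usage sketch of the keyphrase helpers defined above, assuming the models (`kw_model`, `vectorizer`, `embedding_model`, `keyword_embeddings`) initialized in the following cell; `sample_text` is hypothetical. Note that the current `process_url` deliberately skips this pipeline for speed:

    # extract candidate phrases with KeyBERT, embed them, then score each
    # phrase against the supplied keyword embeddings via cosine similarity
    sample_text = "New lidar and battery management systems for electric trucks."
    kph, kph_emb = extract_keyphrases(sample_text, kw_model, vectorizer, embedding_model)
    scores, best = get_similarity_scores(keyword_embeddings, kph_emb)
    print(list(zip(kph, scores)), "max:", best)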
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "5cHnJQDSDy1Q"
+   },
+   "outputs": [],
+   "source": [
+    "import pprint\n",
+    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
+    "from tqdm import tqdm\n",
+    "from datetime import datetime\n",
+    "import nltk\n",
+    "\n",
+    "\n",
+    "# Start the profiling timer\n",
+    "#start_time = time.time()\n",
+    "\n",
+    "# Initialize the SentenceTransformer model\n",
+    "kw_model = KeyBERT('distilbert-base-nli-mean-tokens')\n",
+    "vectorizer = KeyphraseCountVectorizer()\n",
+    "embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')\n",
+    "nltk.download('stopwords', quiet=True)\n",
+    "\n",
+    "# Initialize variables\n",
+    "headers = {\n",
+    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'\n",
+    "}\n",
+    "keyword_embeddings = embedding_model.encode(keywords)  # Encode keywords using the embedding model\n",
+    "\n",
+    "def filter_strings(lst1, lst2):\n",
+    "    \"\"\"\n",
+    "    Filters the list `lst2` and returns only the elements that have any of the elements of `lst1` as a substring.\n",
+    "\n",
+    "    Args:\n",
+    "        lst1 (list): The list of substrings to match against.\n",
+    "        lst2 (list): The list of strings to filter.\n",
+    "\n",
+    "    Returns:\n",
+    "        list: A new list containing the filtered elements from `lst2`.\n",
+    "\n",
+    "    Examples:\n",
+    "        >>> lst1 = ['apple', 'banana', 'orange']\n",
+    "        >>> lst2 = ['apple pie', 'banana bread', 'cherry pie', 'orange juice']\n",
+    "        >>> filter_strings(lst1, lst2)\n",
+    "        ['apple pie', 'banana bread', 'orange juice']\n",
+    "    \"\"\"\n",
+    "    filtered_lst2 = [s for s in lst2 if any(substring in s for substring in lst1)]\n",
+    "    return filtered_lst2\n",
+    "\n",
+    "\n",
+    "def read_feeds(rss_feed, how_old):\n",
+    "    global urls\n",
+    "    import sys\n",
+    "    import io\n",
+    "    import re\n",
+    "    from datetime import datetime, timedelta\n",
+    "    import pytz\n",
+    "\n",
+    "    old_stdout = sys.stdout\n",
+    "    sys.stdout = mystdout = io.StringIO()\n",
+    "\n",
+    "    # Loop over the RSS feeds and keywords\n",
+    "    urls_temp = []\n",
+    "    urls = []\n",
+    "\n",
+    "    # Get the desired timezone\n",
+    "    timezone = pytz.timezone('Europe/Stockholm')  # Replace 'Your_Timezone_Here' with the desired timezone\n",
+    "\n",
+    "    # Calculate the age with timezone\n",
+    "    feed_item_age_minimum = datetime.now(timezone) - timedelta(days=int(how_old.split()[0]))\n",
+    "\n",
+    "    feed = feedparser.parse(rss_feed)\n",
+    "    for entry in tqdm(feed.entries, total=len(feed.entries), file=sys.stdout, bar_format='\\tReading feed entries: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
+    "        soup = BeautifulSoup(entry.summary, 'html.parser')\n",
+    "        updated = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %z')\n",
+    "        if re.search(r'@([^ ]+)', entry.title):\n",
+    "            feed_name = re.search(r'@([^ ]+)', entry.title).group(1)\n",
+    "        else:\n",
+    "            feed_name = ''\n",
+    "        if updated > feed_item_age_minimum:\n",
+    "            urls_temp.extend([{'url': link.get('href'), 'updated': updated, 'feed_name': feed_name} for link in soup.find_all('a')])\n",
+    "\n",
+    "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
+    "        futures = [executor.submit(get_redirected_url, url, headers) for url in urls_temp]\n",
+    "        for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Checking URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
+    "            urls.append(future.result())\n",
+    "\n",
+    "    sys.stdout = old_stdout\n",
+    "    return mystdout.getvalue()\n",
+    "\n",
+    "def read_process_urls():\n",
+    "    import sys\n",
+    "    import io\n",
+    "    from datetime import datetime, timedelta\n",
+    "    old_stdout = sys.stdout\n",
+    "    sys.stdout = mystdout = io.StringIO()\n",
+    "\n",
+    "    global urls\n",
+    "    global url_dict\n",
+    "\n",
+    "    #print(f\"Urls: {urls}\")\n",
+    "    url_dict = {}\n",
+    "    for item in filter_strings(feed_keywords, urls):\n",
+    "        feed_name = item['feed_name']\n",
+    "        updated = item['updated']\n",
+    "        url = item['url']\n",
+    "\n",
+    "        import pprint\n",
+    "        pprint.pprint(url)\n",
+    "        if url not in url_dict.keys():\n",
+    "            url_dict[url] = {'updated': updated, 'feed_name': feed_name}\n",
+    "        else:\n",
+    "            if url_dict[url]['updated'] > updated:\n",
+    "                url_dict[url]['updated'] = updated\n",
+    "\n",
+    "    start_parallel_loop_time = time.time()\n",
+    "    results = []\n",
+    "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
+    "        futures = [executor.submit(process_url, url) for url in url_dict.keys()]#remaining_entries_from_dict(\"retrieved_urls.pkl\", url_dict)]\n",
+    "        for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Reading URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
+    "            results.append(future.result())\n",
+    "    #print(f\"Parallel URL processing: {time.time() - start_parallel_loop_time:.3f} seconds\")\n",
+    "    print(f\"Total links processed: {len(url_dict.keys())}\")\n",
+    "\n",
+    "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
+    "    #    pickle.dump(url_dict, f)\n",
+    "\n",
+    "    # Print the time taken by the main function\n",
+    "    #print(f\"Total time needed: {time.time() - start_time:.3f} seconds\")\n",
+    "\n",
+    "    # Write dataset to the Excel sheet\n",
+    "    write_data_to_excel(url_dict, 'newsletter_results.xlsx')\n",
+    "\n",
+    "    sys.stdout = old_stdout\n",
+    "    return mystdout.getvalue()"
+   ]
+  },
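One caveat in the cell above: `read_process_urls` passes `urls`, a list of dicts, to `filter_strings`, whose `substring in s` test was written for strings. A dict-aware variant matching on the `'url'` field (an assumption about the intent, since the loop body reads `item['url']`) would look like:

    def filter_url_records(substrings, records):
        # hypothetical helper: keep url records whose 'url' contains any of
        # the watched source paths listed in feed_keywords
        return [r for r in records if any(sub in r['url'] for sub in substrings)]

    # filter_url_records(feed_keywords, urls) then yields the same dicts the
    # loop in read_process_urls unpacks into feed_name / updated / url.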
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "FNR1jfm-jsgb"
+   },
+   "outputs": [],
+   "source": [
+    "from ipywidgets import HTML\n",
+    "\n",
+    "read_feeds(feed, feed_age)\n",
+    "display(HTML(f\"Total links examined: {len(urls)}\"))\n",
+    "\n",
+    "read_process_urls()\n",
+    "display(HTML(f\"Relevant links found: {len(url_dict.keys())}\"))\n",
+    "display(HTML(f\"------------------------------\"))\n",
+    "\n",
+    "for url in url_dict.keys():\n",
+    "    #print(url)\n",
+    "    display(HTML(f\"{url}\"))\n"
    ]
   }
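Both `read_feeds` and `read_process_urls` swap `sys.stdout` for a `StringIO` and return the captured tqdm/print log as a string, which the cell above discards. A minimal sketch, assuming a notebook front end, of surfacing that log:

    log = read_feeds(feed, feed_age)           # returns the captured progress text
    display(HTML(log.replace("\n", "<br>")))   # render it next to the link list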
+ ],
+ "metadata": {
+  "accelerator": "GPU",
   "colab": {
+   "provenance": []
   },
+  "gpuClass": "standard",
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
 },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
vc_newsletter_results.xlsx (DELETED)

Binary file (5.51 kB)