Spaces · Build error
georgiad committed · Commit f0bc6d4 · Parent(s): 8f98272
Switched to single feed

Files changed:
  newsletter_production_mercury.ipynb  +436 -485
  vc_newsletter_results.xlsx           +0 -0
newsletter_production_mercury.ipynb (CHANGED)

@@ -1,493 +1,444 @@
 {
- … (old lines 2-73 removed; content not rendered in the diff)
  {
-
- "
-
  ]
-    },
-    "execution_count": 2,
-    "metadata": {},
-    "output_type": "execute_result"
  }
-
-
- "
-    "            \"Batteries \",\n",
-    "            \"Battery Management systems\",\n",
-    "            \"Lidars\",\n",
-    "            \"RADARS\",\n",
-    "            \"AI\",\n",
-    "            \"Industrial AI\",\n",
-    "            \"Transportation\",\n",
-    "            \"Mobility\",\n",
-    "            \"Climate Tech\",\n",
-    "            \"Sustainable grid\",\n",
-    "            \"Sensor fusion\",\n",
-    "            \"Computer vision\",\n",
-    "            \"Data Analytics\",\n",
-    "            \"Digital Twins\",\n",
-    "            \"Automotive Cybersecurity\",\n",
-    "            \"Logistics\",\n",
-    "            \"Ports\",\n",
-    "            \"Construction sites\",\n",
-    "            \"Mines\",\n",
-    "            \"Quarries\",\n",
-    "            \"Trucks\",\n",
-    "            \"Power train\",\n",
-    "            \"Software defined vehicle\"]\n",
-    "'''\n",
-    "feed_dict = {'a16z.com/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/64844802/a16z',\n",
-    "             'sequoiacap.com/article': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/24775410/sequoia',\n",
-    "             'zettavp.com/playbook/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/3952740738/ZettaVentures',\n",
-    "             'atomico.com/insights/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/44579473/atomico',\n",
-    "             'nt-z.ro/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/793585473956425728/Breakthrough',\n",
-    "             'accel.com/noteworthy': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/33846451/Accel',\n",
-    "             'felicis.com/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/382162909/felicis',\n",
-    "             'scalevp.com/blog/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/17904605/scalevp',\n",
-    "             'redpoint.com/start/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/104286459/Redpoint',\n",
-    "             '83north.com/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/2963095189/83NorthVC',\n",
-    "             'bvp.com/atlas/': 'https://rssbox.us-west-2.elasticbeanstalk.com/twitter/59166197/BessemerVP'}\n",
-    "'''\n",
-    "\n",
-    "feed_dict = {'a16z.com/': 'https://nitter.kavin.rocks/a16z/rss'}\n",
-    "''',\n",
-    "             'sequoiacap.com/article': 'https://nitter.kavin.rocks/sequoia/rss',\n",
-    "             'zettavp.com/playbook/': 'https://nitter.kavin.rocks/ZettaVentures/rss',\n",
-    "             'atomico.com/insights/': 'https://nitter.kavin.rocks/atomico/rss',\n",
-    "             'nt-z.ro/': 'https://nitter.kavin.rocks/Breakthrough/rss',\n",
-    "             'accel.com/noteworthy': 'https://nitter.kavin.rocks/Accel/rss',\n",
-    "             'felicis.com/': 'https://nitter.kavin.rocks/felicis/rss',\n",
-    "             'scalevp.com/blog/': 'https://nitter.kavin.rocks/scalevp/rss',\n",
-    "             'redpoint.com/start/': 'https://nitter.kavin.rocks/Redpoint/rss',\n",
-    "             '83north.com/': 'https://nitter.kavin.rocks/83NorthVC/rss',\n",
-    "             'bvp.com/atlas/': 'https://nitter.kavin.rocks/BessemerVP/rss'}\n",
-    "'''"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "id": "Ig5nSCbI6yuL"
-   },
-   "outputs": [],
-   "source": [
-    "from keybert import KeyBERT\n",
-    "import pandas as pd\n",
-    "from keyphrase_vectorizers import KeyphraseCountVectorizer\n",
-    "from sentence_transformers import SentenceTransformer\n",
-    "import numpy as np\n",
-    "from sklearn.metrics.pairwise import cosine_similarity\n",
-    "\n",
-    "import feedparser\n",
-    "import requests\n",
-    "from bs4 import BeautifulSoup\n",
-    "from openpyxl import Workbook\n",
-    "import time\n",
-    "import pickle\n",
-    "import os\n",
-    "from tqdm import tqdm\n",
-    "from concurrent.futures import ThreadPoolExecutor\n",
-    "#from functools import lru_cache\n",
-    "\n",
-    "# Define function to extract keywords from the HTML body using the YAKE keyword extractor\n",
-    "def extract_keyphrases(text, kw_model, vectorizer, embedding_model):\n",
-    "    kph = [kw for kw, score in kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', vectorizer=vectorizer, use_mmr=True)]\n",
-    "    keyphrase_embeddings = embedding_model.encode(kph)\n",
-    "    return kph, keyphrase_embeddings\n",
-    "\n",
-    "def get_similarity_scores(keyword_embeddings, keyphrase_embeddings):\n",
-    "    similarity_scores = cosine_similarity(keyphrase_embeddings, keyword_embeddings).max(axis=1).astype(str).tolist()\n",
-    "    similarity_max = cosine_similarity(keyphrase_embeddings, keyword_embeddings).flatten().max().astype(str)\n",
-    "    return similarity_scores, similarity_max\n",
-    "\n",
-    "# Define function to get the redirected URL (if any) for a given URL\n",
-    "def get_redirected_url(url_record, headers, expected_codes=(301, 302, 303, 307), timeout=60):\n",
-    "    try:\n",
-    "        res = requests.head(url_record['url'], headers=headers, timeout=timeout)\n",
-    "        if res.status_code in expected_codes:\n",
-    "            url_record['url'] = res.headers['location']\n",
-    "        elif res.status_code == 200:\n",
-    "            url_record['url'] = url_record['url']\n",
-    "        else:\n",
-    "            print(f\"Retrieving {url_record['url']} failed: Expected {expected_codes}, but received {res.status_code}: {res.reason}\")\n",
-    "    except requests.exceptions.Timeout:\n",
-    "        print(f\"\\nRequest timed out for {url_record['url']}\")\n",
-    "        return url_record\n",
-    "    except:\n",
-    "        return url_record\n",
-    "\n",
-    "    return url_record\n",
-    "\n",
-    "# Define function to get the HTML body of a given URL\n",
-    "def get_html_body(url, headers):\n",
-    "    try:\n",
-    "        response = requests.get(url, headers=headers, timeout=10)\n",
-    "        html = response.content\n",
-    "        soup = BeautifulSoup(html, 'html.parser')\n",
-    "        return soup.body.get_text()\n",
-    "    except:\n",
-    "        return ''\n",
-    "\n",
-    "# Define function to write data to the Excel sheet\n",
-    "def write_data_to_excel(url_dict, filename):\n",
-    "    # Create a new Excel workbook and worksheet\n",
-    "    workbook = Workbook()\n",
-    "    worksheet = workbook.active\n",
-    "    worksheet.title = 'RSS Feeds'\n",
-    "\n",
-    "    # Write the headers for the Excel sheet\n",
-    "    worksheet.cell(row=1, column=1, value='Feed Name')\n",
-    "    worksheet.cell(row=1, column=2, value='URL')\n",
-    "    worksheet.cell(row=1, column=3, value='Updated')\n",
-    "    worksheet.cell(row=1, column=4, value='Keyphrases')\n",
-    "    worksheet.cell(row=1, column=5, value='Similarity to supplied keywords')\n",
-    "    worksheet.cell(row=1, column=6, value='Similarity (max)')\n",
-    "    worksheet.cell(row=1, column=7, value='HTML Body')\n",
-    "\n",
-    "    # Loop over the unique URLs and write them to the Excel sheet\n",
-    "    row_num = 2\n",
-    "    for url, data in url_dict.items():\n",
-    "        worksheet.cell(row=row_num, column=1, value=data['feed_name'])\n",
-    "        worksheet.cell(row=row_num, column=2, value=url)\n",
-    "        worksheet.cell(row=row_num, column=3, value=data['updated'])\n",
-    "        worksheet.cell(row=row_num, column=4, value=data['keyphrases'])\n",
-    "        worksheet.cell(row=row_num, column=5, value=data['similarity'])\n",
-    "        worksheet.cell(row=row_num, column=6, value=data['similarity_max'])\n",
-    "        worksheet.cell(row=row_num, column=7, value=data['html_body'])\n",
-    "\n",
-    "        row_num += 1\n",
-    "\n",
-    "    worksheet.freeze_panes = 'A2'\n",
-    "\n",
-    "    # Set the number format for column A, except the first row\n",
-    "    for row in worksheet.iter_rows(min_row=2, min_col=3, max_col=3):\n",
-    "        for cell in row:\n",
-    "            cell.number_format = 'mm/dd/yyyy hh:mm:ss'\n",
-    "\n",
-    "    # Save the Excel workbook\n",
-    "    workbook.save(filename)\n",
-    "\n",
-    "    # Print confirmation message\n",
-    "    #print(f'RSS output written to excel sheet: {filename}')\n",
-    "\n",
-    "def remaining_entries_from_dict(filename, dictionary):\n",
-    "    pickle_data = {}\n",
-    "    if os.path.exists(filename):\n",
-    "        with open(filename, 'rb') as f:\n",
-    "            pickle_data = pickle.load(f)\n",
-    "    return list(set(dictionary.keys()) - set(pickle_data.keys()))\n",
-    "\n",
-    "def process_url(url):\n",
-    "    global url_dict\n",
-    "    \n",
-    "    #body = get_html_body(url, headers)\n",
-    "    #kph,keyphrase_embeddings = extract_keyphrases(body, kw_model, vectorizer, embedding_model)\n",
-    "    #similarity, similarity_max = get_similarity_scores(keyword_embeddings, keyphrase_embeddings)\n",
-    "\n",
-    "    #url_dict[url]['keyphrases'] = ', '.join(kph)\n",
-    "    #url_dict[url]['similarity'] = ', '.join(similarity)\n",
-    "    #url_dict[url]['similarity_max'] = similarity_max\n",
-    "    #url_dict[url]['html_body'] = body\n",
-    "    \n",
-    "    url_dict[url]['keyphrases'] = ''\n",
-    "    url_dict[url]['similarity'] = ''\n",
-    "    url_dict[url]['similarity_max'] = ''\n",
-    "    url_dict[url]['html_body'] = \"Skipping this part, to speed up the process\"\n",
-    "\n",
-    "    # Store temporary results to disk\n",
-    "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
-    "    #    pickle.dump(url_dict, f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
    "colab": {
-
    },
- "
- "
-
-
-
-    "import pprint\n",
-    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
-    "from tqdm import tqdm\n",
-    "from datetime import datetime\n",
-    "import nltk\n",
-    "\n",
-    "\n",
-    "# Start the profiling timer\n",
-    "start_time = time.time()\n",
-    "\n",
-    "# Initialize the SentenceTransformer model\n",
-    "kw_model = KeyBERT('distilbert-base-nli-mean-tokens')\n",
-    "vectorizer = KeyphraseCountVectorizer()\n",
-    "embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')\n",
-    "nltk.download('stopwords', quiet=True)\n",
-    "\n",
-    "# Initialize variables\n",
-    "headers = {\n",
-    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'\n",
-    "}\n",
-    "keyword_embeddings = embedding_model.encode(keywords)  # Encode keywords using the embedding model\n",
-    "\n",
-    "def read_feeds(feed_list, how_old):\n",
-    "    global urls\n",
-    "    import sys\n",
-    "    import io\n",
-    "    from datetime import datetime, timedelta\n",
-    "\n",
-    "    old_stdout = sys.stdout\n",
-    "    sys.stdout = mystdout = io.StringIO()\n",
-    "    print(\"A small change\")\n",
-    "    # Loop over the RSS feeds and keywords\n",
-    "    urls_temp = []\n",
-    "    urls = []\n",
-    "    feed_item_age_minimum = datetime.now() - timedelta(days=int(how_old.split()[0]))\n",
-    "\n",
-    "    for keyword, rss_feed in tqdm(feed_list.items(), total=len(feed_list.items()), file=sys.stdout, bar_format='Reading feeds: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
-    "        print(rss_feed)\n",
-    "        feed_name = rss_feed.split('/')[-1]\n",
-    "        feed = feedparser.parse(rss_feed)\n",
-    "        for entry in tqdm(feed.entries, total=len(feed.entries), file=sys.stdout, bar_format='\\tReading feed entries: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
-    "            if 'content' in entry:\n",
-    "                for content in entry.content:\n",
-    "                    soup = BeautifulSoup(content.get('value'), 'html.parser')\n",
-    "                    updated = datetime.strptime(entry.updated, '%Y-%m-%dT%H:%M:%SZ')\n",
-    "                    if updated > feed_item_age_minimum:\n",
-    "                        urls_temp.extend([{'url': link.get('href'), 'updated': updated, 'feed_name': feed_name} for link in soup.find_all('a')])\n",
-    "\n",
-    "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
-    "        futures = [executor.submit(get_redirected_url, url, headers) for url in urls_temp]\n",
-    "        for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Checking URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
-    "            urls.append(future.result())\n",
-    "\n",
-    "    sys.stdout = old_stdout\n",
-    "    return mystdout.getvalue()\n",
-    "\n",
-    "def read_process_urls():\n",
-    "    import sys\n",
-    "    import io\n",
-    "    from datetime import datetime, timedelta\n",
-    "    old_stdout = sys.stdout\n",
-    "    sys.stdout = mystdout = io.StringIO()\n",
-    "\n",
-    "    global urls\n",
-    "    global url_dict\n",
-    "    kw_for = {rss_feed.split('/')[-1]: keyword for keyword, rss_feed in feed_dict.items()}\n",
-    "\n",
-    "    #print(f\"Urls: {urls}\")\n",
-    "    url_dict = {}\n",
-    "    for item in urls:\n",
-    "        feed_name = item['feed_name']\n",
-    "        updated = item['updated']\n",
-    "        url = item['url']\n",
-    "\n",
-    "        import pprint\n",
-    "        pprint.pprint(url)\n",
-    "        pprint.pprint(kw_for[feed_name])\n",
-    "        if kw_for[feed_name] in url:\n",
-    "            if url not in url_dict.keys():\n",
-    "                url_dict[url] = {'updated': updated, 'feed_name': feed_name}\n",
-    "            else:\n",
-    "                if url_dict[url]['updated'] > updated:\n",
-    "                    url_dict[url]['updated'] = updated\n",
-    "\n",
-    "    start_parallel_loop_time = time.time()\n",
-    "    results = []\n",
-    "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
-    "        futures = [executor.submit(process_url, url) for url in url_dict.keys()]#remaining_entries_from_dict(\"retrieved_urls.pkl\", url_dict)]\n",
-    "        for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Reading URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
-    "            results.append(future.result())\n",
-    "    #print(f\"Parallel URL processing: {time.time() - start_parallel_loop_time:.3f} seconds\")\n",
-    "    print(f\"Total links processed: {len(url_dict.keys())}\")\n",
-    "\n",
-    "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
-    "    #    pickle.dump(url_dict, f)\n",
-    "\n",
-    "    # Print the time taken by the main function\n",
-    "    print(f\"Total time needed: {time.time() - start_time:.3f} seconds\")\n",
-    "\n",
-    "    # Write dataset to the Excel sheet\n",
-    "    write_data_to_excel(url_dict, 'newsletter_results.xlsx')\n",
-    "\n",
-    "    sys.stdout = old_stdout\n",
-    "    return mystdout.getvalue()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 622,
-     "referenced_widgets": [
-      "55c86791cb9a491c8bb6217af108fa16",
-      "63d6badce4dc45a0a4af8660c9a96b5d",
-      "733bbc713dea4917bcdde749cafbd99c",
-      "975eed9f527e441c99ff509b5cc53bc1",
-      "3ef1bf00fdec434ebfe32715777368ff",
-      "cf14a6e4b11840bdae7166460ea8485f",
-      "1991f11a746b4c4ba005ae4eee9ae0ad",
-      "4835d51282334101ac67c821bd5d628c",
-      "106ead77e8064e65844b810e3f5d574d",
-      "6537a1a32eab44b08900a48a6f005dd7",
-      "d66e01f3991c4ec69e3e2f9d204c3898",
-      "aa70b01aa66147f592652a30b890c888",
-      "ff0115a2cae843c1b164e4b7d179ff24",
-      "c52d63ae33c14fe9befe4debb4f7ab21",
-      "eeb751b9682144ca90b1d467a6f525aa",
-      "7347a703f72c4d459217afb92c3c05b1",
-      "583f99b9cfd945a4afaede6cfa0961db",
-      "c22b236a142144d3a0fdae9163a795d3",
-      "93719585652c49cc8c32455577423532",
-      "b30da31352da4278b8180b7f2c4080fa",
-      "7ed878a7f4df4e52989d8e16c542842b",
-      "2e7d9d0f096a4b5592ab3116766c9158",
-      "03448be097a84dd6bc0c5064166b22d0",
-      "938b6fc3ee6a4cbd95196a6e5ab111d9",
-      "5728cc854a2a4c6fa037b78889dedb20",
-      "1d8c3034625f4b598a1875b74f7efdfd",
-      "0cc60859ebd7478fad11cb51113da820",
-      "1bb65fbc92144355a0977d88d6c3bf50",
-      "81ca37de802f427791e4d0f740735d0a",
-      "a27f0a049a5b4d869d1297afac81099f",
-      "bfa325bddbec4f98bd60e436461db0af",
-      "27a8d7b324ba43798be54e7685eb345a",
-      "8db6fe180d234c7d9d60cadcfadc5bc7",
-      "8e4869a380aa4f5faa05335c16d6e972",
-      "997639d0dcc8461a89fb9bcdcfffacbe"
-     ]
    },
- "
- … (old lines 439-449 not rendered)
-    "display(HTML(f\"------------------------------\"))\n",
-    "\n",
-    "for url in url_dict.keys():\n",
-    "    #print(url)\n",
-    "    display(HTML(f\"{url}\"))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "urls"
-   ]
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "provenance": []
-  },
-  "gpuClass": "standard",
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
  },
- "
-
-
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
- }
+ "cells": [
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "id": "uIxcPJeuGGAF"
+   },
+   "source": [
+    "---\n",
+    "title: Newsletter Helper\n",
+    "description: Follow the instructions on screen\n",
+    "show-code: false\n",
+    "params:\n",
+    "    feed_keywords:\n",
+    "        label: Sources\n",
+    "        input: select\n",
+    "        value: ['a16z.com/',\n",
+    "            'sequoiacap.com/article',\n",
+    "            'zettavp.com/playbook/',\n",
+    "            'atomico.com/insights/',\n",
+    "            'nt-z.ro/',\n",
+    "            'accel.com/noteworthy',\n",
+    "            'felicis.com/',\n",
+    "            'scalevp.com/blog/',\n",
+    "            'redpoint.com/start/',\n",
+    "            '83north.com/',\n",
+    "            'bvp.com/atlas/']\n",
+    "        choices: ['a16z.com/',\n",
+    "            'sequoiacap.com/article',\n",
+    "            'zettavp.com/playbook/',\n",
+    "            'atomico.com/insights/',\n",
+    "            'nt-z.ro/',\n",
+    "            'accel.com/noteworthy',\n",
+    "            'felicis.com/',\n",
+    "            'scalevp.com/blog/',\n",
+    "            'redpoint.com/start/',\n",
+    "            '83north.com/',\n",
+    "            'bvp.com/atlas/']\n",
+    "        multi: True\n",
+    "    feed_age:\n",
+    "        label: How old?\n",
+    "        input: select\n",
+    "        value: '7 days'\n",
+    "        choices: ['7 days', '14 days', '30 days']\n",
+    "        multi: False\n",
+    "---"
+   ]
+  },
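The raw cell above is the Mercury-style YAML header that turns the notebook into a web app: each entry under `params` becomes a UI widget, and the chosen widget values override same-named variables in the notebook. A minimal sketch of that assumed wiring (the values here are hypothetical, not part of the commit):

    # Hedged sketch: a Mercury-style runner is assumed to inject the UI
    # selections over the defaults assigned in the next code cell.
    ui_values = {
        "feed_keywords": ["a16z.com/", "bvp.com/atlas/"],  # hypothetical picks
        "feed_age": "14 days",
    }
    globals().update(ui_values)  # feed_keywords / feed_age now shadow the defaults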
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "id": "pfJ5NpqjCT1U"
+   },
+   "outputs": [],
+   "source": [
+    "feed_keywords = ['a16z.com/',\n",
+    "                 'sequoiacap.com/article',\n",
+    "                 'zettavp.com/playbook/',\n",
+    "                 'atomico.com/insights/',\n",
+    "                 'nt-z.ro/',\n",
+    "                 'accel.com/noteworthy',\n",
+    "                 'felicis.com/',\n",
+    "                 'scalevp.com/blog/',\n",
+    "                 'redpoint.com/start/',\n",
+    "                 '83north.com/',\n",
+    "                 'bvp.com/atlas/']\n",
+    "feed_age = '28 days'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "mEOS4asyGGAI"
+   },
+   "outputs": [],
+   "source": [
+    "keywords = [\"Electro mobility\",\n",
+    "            \"Batteries \",\n",
+    "            \"Battery Management systems\",\n",
+    "            \"Lidars\",\n",
+    "            \"RADARS\",\n",
+    "            \"AI\",\n",
+    "            \"Industrial AI\",\n",
+    "            \"Transportation\",\n",
+    "            \"Mobility\",\n",
+    "            \"Climate Tech\",\n",
+    "            \"Sustainable grid\",\n",
+    "            \"Sensor fusion\",\n",
+    "            \"Computer vision\",\n",
+    "            \"Data Analytics\",\n",
+    "            \"Digital Twins\",\n",
+    "            \"Automotive Cybersecurity\",\n",
+    "            \"Logistics\",\n",
+    "            \"Ports\",\n",
+    "            \"Construction sites\",\n",
+    "            \"Mines\",\n",
+    "            \"Quarries\",\n",
+    "            \"Trucks\",\n",
+    "            \"Power train\",\n",
+    "            \"Software defined vehicle\"]\n",
+    "\n",
+    "feed = \"https://www.rssground.com/p/Newsletter\""
+   ]
+  },
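The cell above is the heart of this commit: the per-VC `feed_dict` of the old version is gone, and a single aggregated RSS URL is used instead. A minimal sketch, assuming the feed is reachable, of what `feedparser` exposes for it; the fields `title`, `published`, and `summary` are the ones `read_feeds` below relies on:

    import feedparser

    d = feedparser.parse("https://www.rssground.com/p/Newsletter")
    for entry in d.entries[:3]:
        # 'published' is an RFC 822 date string; 'summary' carries the HTML
        # from which read_feeds extracts the <a href=...> links
        print(entry.title, "|", entry.published)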
+  {
+   "cell_type": "code",
+   "source": [
+    "#!pip install keybert\n",
+    "#!pip install feedparser\n",
+    "#!pip install keyphrase_vectorizers\n",
+    "#!pip install sentence-transformers"
+   ],
+   "metadata": {
+    "id": "WMswc6FCGR9T"
+   },
+   "execution_count": 11,
+   "outputs": []
+  },
  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "id": "Ig5nSCbI6yuL"
+   },
+   "outputs": [],
+   "source": [
+    "from keybert import KeyBERT\n",
+    "import pandas as pd\n",
+    "from keyphrase_vectorizers import KeyphraseCountVectorizer\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "import numpy as np\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "import feedparser\n",
+    "import requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "from openpyxl import Workbook\n",
+    "import time\n",
+    "import pickle\n",
+    "import os\n",
+    "from tqdm import tqdm\n",
+    "from concurrent.futures import ThreadPoolExecutor\n",
+    "#from functools import lru_cache\n",
+    "\n",
+    "# Define function to extract keywords from the HTML body using the YAKE keyword extractor\n",
+    "def extract_keyphrases(text, kw_model, vectorizer, embedding_model):\n",
+    "    kph = [kw for kw, score in kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', vectorizer=vectorizer, use_mmr=True)]\n",
+    "    keyphrase_embeddings = embedding_model.encode(kph)\n",
+    "    return kph, keyphrase_embeddings\n",
+    "\n",
+    "def get_similarity_scores(keyword_embeddings, keyphrase_embeddings):\n",
+    "    similarity_scores = cosine_similarity(keyphrase_embeddings, keyword_embeddings).max(axis=1).astype(str).tolist()\n",
+    "    similarity_max = cosine_similarity(keyphrase_embeddings, keyword_embeddings).flatten().max().astype(str)\n",
+    "    return similarity_scores, similarity_max\n",
+    "\n",
+    "# Define function to get the redirected URL (if any) for a given URL\n",
+    "def get_redirected_url(url_record, headers, expected_codes=(301, 302, 303, 307), timeout=60):\n",
+    "    try:\n",
+    "        res = requests.head(url_record['url'], headers=headers, timeout=timeout)\n",
+    "        if res.status_code in expected_codes:\n",
+    "            url_record['url'] = res.headers['location']\n",
+    "        elif res.status_code == 200:\n",
+    "            url_record['url'] = url_record['url']\n",
+    "        else:\n",
+    "            print(f\"Retrieving {url_record['url']} failed: Expected {expected_codes}, but received {res.status_code}: {res.reason}\")\n",
+    "    except requests.exceptions.Timeout:\n",
+    "        print(f\"\\nRequest timed out for {url_record['url']}\")\n",
+    "        return url_record\n",
+    "    except:\n",
+    "        return url_record\n",
+    "\n",
+    "    return url_record\n",
+    "\n",
+    "# Define function to get the HTML body of a given URL\n",
+    "def get_html_body(url, headers):\n",
+    "    try:\n",
+    "        response = requests.get(url, headers=headers, timeout=10)\n",
+    "        html = response.content\n",
+    "        soup = BeautifulSoup(html, 'html.parser')\n",
+    "        return soup.body.get_text()\n",
+    "    except:\n",
+    "        return ''\n",
+    "\n",
+    "# Define function to write data to the Excel sheet\n",
+    "def write_data_to_excel(url_dict, filename):\n",
+    "    # Create a new Excel workbook and worksheet\n",
+    "    workbook = Workbook()\n",
+    "    worksheet = workbook.active\n",
+    "    worksheet.title = 'RSS Feeds'\n",
+    "\n",
+    "    # Write the headers for the Excel sheet\n",
+    "    worksheet.cell(row=1, column=1, value='Feed Name')\n",
+    "    worksheet.cell(row=1, column=2, value='URL')\n",
+    "    worksheet.cell(row=1, column=3, value='Updated')\n",
+    "    worksheet.cell(row=1, column=4, value='Keyphrases')\n",
+    "    worksheet.cell(row=1, column=5, value='Similarity to supplied keywords')\n",
+    "    worksheet.cell(row=1, column=6, value='Similarity (max)')\n",
+    "    worksheet.cell(row=1, column=7, value='HTML Body')\n",
+    "\n",
+    "    # Loop over the unique URLs and write them to the Excel sheet\n",
+    "    row_num = 2\n",
+    "    for url, data in url_dict.items():\n",
+    "        worksheet.cell(row=row_num, column=1, value=data['feed_name'])\n",
+    "        worksheet.cell(row=row_num, column=2, value=url)\n",
+    "        worksheet.cell(row=row_num, column=3, value=data['updated'])\n",
+    "        worksheet.cell(row=row_num, column=4, value=data['keyphrases'])\n",
+    "        worksheet.cell(row=row_num, column=5, value=data['similarity'])\n",
+    "        worksheet.cell(row=row_num, column=6, value=data['similarity_max'])\n",
+    "        worksheet.cell(row=row_num, column=7, value=data['html_body'])\n",
+    "\n",
+    "        row_num += 1\n",
+    "\n",
+    "    worksheet.freeze_panes = 'A2'\n",
+    "\n",
+    "    # Set the number format for column A, except the first row\n",
+    "    for row in worksheet.iter_rows(min_row=2, min_col=3, max_col=3):\n",
+    "        for cell in row:\n",
+    "            cell.number_format = 'mm/dd/yyyy hh:mm:ss'\n",
+    "\n",
+    "    # Save the Excel workbook\n",
+    "    workbook.save(filename)\n",
+    "\n",
+    "    # Print confirmation message\n",
+    "    #print(f'RSS output written to excel sheet: {filename}')\n",
+    "\n",
+    "def remaining_entries_from_dict(filename, dictionary):\n",
+    "    pickle_data = {}\n",
+    "    if os.path.exists(filename):\n",
+    "        with open(filename, 'rb') as f:\n",
+    "            pickle_data = pickle.load(f)\n",
+    "    return list(set(dictionary.keys()) - set(pickle_data.keys()))\n",
+    "\n",
+    "def process_url(url):\n",
+    "    global url_dict\n",
+    "    \n",
+    "    #body = get_html_body(url, headers)\n",
+    "    #kph,keyphrase_embeddings = extract_keyphrases(body, kw_model, vectorizer, embedding_model)\n",
+    "    #similarity, similarity_max = get_similarity_scores(keyword_embeddings, keyphrase_embeddings)\n",
+    "\n",
+    "    #url_dict[url]['keyphrases'] = ', '.join(kph)\n",
+    "    #url_dict[url]['similarity'] = ', '.join(similarity)\n",
+    "    #url_dict[url]['similarity_max'] = similarity_max\n",
+    "    #url_dict[url]['html_body'] = body\n",
+    "    \n",
+    "    url_dict[url]['keyphrases'] = ''\n",
+    "    url_dict[url]['similarity'] = ''\n",
+    "    url_dict[url]['similarity_max'] = ''\n",
+    "    url_dict[url]['html_body'] = \"Skipping this part, to speed up the process\"\n",
+    "\n",
+    "    # Store temporary results to disk\n",
+    "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
+    "    #    pickle.dump(url_dict, f)"
+   ]
+  },
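For reference, a hedged usage sketch of the keyphrase helpers defined above, assuming the models (`kw_model`, `vectorizer`, `embedding_model`, `keyword_embeddings`) initialized in the following cell; `sample_text` is hypothetical. Note that the current `process_url` deliberately skips this pipeline for speed:

    # extract candidate phrases with KeyBERT, embed them, then score each
    # phrase against the supplied keyword embeddings via cosine similarity
    sample_text = "New lidar and battery management systems for electric trucks."
    kph, kph_emb = extract_keyphrases(sample_text, kw_model, vectorizer, embedding_model)
    scores, best = get_similarity_scores(keyword_embeddings, kph_emb)
    print(list(zip(kph, scores)), "max:", best)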
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "5cHnJQDSDy1Q"
+   },
+   "outputs": [],
+   "source": [
+    "import pprint\n",
+    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
+    "from tqdm import tqdm\n",
+    "from datetime import datetime\n",
+    "import nltk\n",
+    "\n",
+    "\n",
+    "# Start the profiling timer\n",
+    "#start_time = time.time()\n",
+    "\n",
+    "# Initialize the SentenceTransformer model\n",
+    "kw_model = KeyBERT('distilbert-base-nli-mean-tokens')\n",
+    "vectorizer = KeyphraseCountVectorizer()\n",
+    "embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')\n",
+    "nltk.download('stopwords', quiet=True)\n",
+    "\n",
+    "# Initialize variables\n",
+    "headers = {\n",
+    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'\n",
+    "}\n",
+    "keyword_embeddings = embedding_model.encode(keywords)  # Encode keywords using the embedding model\n",
+    "\n",
+    "def filter_strings(lst1, lst2):\n",
+    "    \"\"\"\n",
+    "    Filters the list `lst2` and returns only the elements that have any of the elements of `lst1` as a substring.\n",
+    "\n",
+    "    Args:\n",
+    "        lst1 (list): The list of substrings to match against.\n",
+    "        lst2 (list): The list of strings to filter.\n",
+    "\n",
+    "    Returns:\n",
+    "        list: A new list containing the filtered elements from `lst2`.\n",
+    "\n",
+    "    Examples:\n",
+    "        >>> lst1 = ['apple', 'banana', 'orange']\n",
+    "        >>> lst2 = ['apple pie', 'banana bread', 'cherry pie', 'orange juice']\n",
+    "        >>> filter_strings(lst1, lst2)\n",
+    "        ['apple pie', 'banana bread', 'orange juice']\n",
+    "    \"\"\"\n",
+    "    filtered_lst2 = [s for s in lst2 if any(substring in s for substring in lst1)]\n",
+    "    return filtered_lst2\n",
+    "\n",
+    "\n",
+    "def read_feeds(rss_feed, how_old):\n",
+    "    global urls\n",
+    "    import sys\n",
+    "    import io\n",
+    "    import re\n",
+    "    from datetime import datetime, timedelta\n",
+    "    import pytz\n",
+    "\n",
+    "    old_stdout = sys.stdout\n",
+    "    sys.stdout = mystdout = io.StringIO()\n",
+    "\n",
+    "    # Loop over the RSS feeds and keywords\n",
+    "    urls_temp = []\n",
+    "    urls = []\n",
+    "\n",
+    "    # Get the desired timezone\n",
+    "    timezone = pytz.timezone('Europe/Stockholm')  # Replace 'Your_Timezone_Here' with the desired timezone\n",
+    "\n",
+    "    # Calculate the age with timezone\n",
+    "    feed_item_age_minimum = datetime.now(timezone) - timedelta(days=int(how_old.split()[0]))\n",
+    "\n",
+    "    feed = feedparser.parse(rss_feed)\n",
+    "    for entry in tqdm(feed.entries, total=len(feed.entries), file=sys.stdout, bar_format='\\tReading feed entries: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
+    "        soup = BeautifulSoup(entry.summary, 'html.parser')\n",
+    "        updated = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %z')\n",
+    "        if re.search(r'@([^ ]+)', entry.title):\n",
+    "            feed_name = re.search(r'@([^ ]+)', entry.title).group(1)\n",
+    "        else:\n",
+    "            feed_name = ''\n",
+    "        if updated > feed_item_age_minimum:\n",
+    "            urls_temp.extend([{'url': link.get('href'), 'updated': updated, 'feed_name': feed_name} for link in soup.find_all('a')])\n",
+    "\n",
+    "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
+    "        futures = [executor.submit(get_redirected_url, url, headers) for url in urls_temp]\n",
+    "        for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Checking URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
+    "            urls.append(future.result())\n",
+    "\n",
+    "    sys.stdout = old_stdout\n",
+    "    return mystdout.getvalue()\n",
+    "\n",
+    "def read_process_urls():\n",
+    "    import sys\n",
+    "    import io\n",
+    "    from datetime import datetime, timedelta\n",
+    "    old_stdout = sys.stdout\n",
+    "    sys.stdout = mystdout = io.StringIO()\n",
+    "\n",
+    "    global urls\n",
+    "    global url_dict\n",
+    "\n",
+    "    #print(f\"Urls: {urls}\")\n",
+    "    url_dict = {}\n",
+    "    for item in filter_strings(feed_keywords, urls):\n",
+    "        feed_name = item['feed_name']\n",
+    "        updated = item['updated']\n",
+    "        url = item['url']\n",
+    "\n",
+    "        import pprint\n",
+    "        pprint.pprint(url)\n",
+    "        if url not in url_dict.keys():\n",
+    "            url_dict[url] = {'updated': updated, 'feed_name': feed_name}\n",
+    "        else:\n",
+    "            if url_dict[url]['updated'] > updated:\n",
+    "                url_dict[url]['updated'] = updated\n",
+    "\n",
+    "    start_parallel_loop_time = time.time()\n",
+    "    results = []\n",
+    "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
+    "        futures = [executor.submit(process_url, url) for url in url_dict.keys()]#remaining_entries_from_dict(\"retrieved_urls.pkl\", url_dict)]\n",
+    "        for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Reading URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
+    "            results.append(future.result())\n",
+    "    #print(f\"Parallel URL processing: {time.time() - start_parallel_loop_time:.3f} seconds\")\n",
+    "    print(f\"Total links processed: {len(url_dict.keys())}\")\n",
+    "\n",
+    "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
+    "    #    pickle.dump(url_dict, f)\n",
+    "\n",
+    "    # Print the time taken by the main function\n",
+    "    #print(f\"Total time needed: {time.time() - start_time:.3f} seconds\")\n",
+    "\n",
+    "    # Write dataset to the Excel sheet\n",
+    "    write_data_to_excel(url_dict, 'newsletter_results.xlsx')\n",
+    "\n",
+    "    sys.stdout = old_stdout\n",
+    "    return mystdout.getvalue()"
+   ]
+  },
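One caveat in the cell above: `read_process_urls` passes `urls`, a list of dicts, to `filter_strings`, whose `substring in s` test was written for strings. A dict-aware variant matching on the `'url'` field (an assumption about the intent, since the loop body reads `item['url']`) would look like:

    def filter_url_records(substrings, records):
        # hypothetical helper: keep url records whose 'url' contains any of
        # the watched source paths listed in feed_keywords
        return [r for r in records if any(sub in r['url'] for sub in substrings)]

    # filter_url_records(feed_keywords, urls) then yields the same dicts the
    # loop in read_process_urls unpacks into feed_name / updated / url.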
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "FNR1jfm-jsgb"
+   },
+   "outputs": [],
+   "source": [
+    "from ipywidgets import HTML\n",
+    "\n",
+    "read_feeds(feed, feed_age)\n",
+    "display(HTML(f\"Total links examined: {len(urls)}\"))\n",
+    "\n",
+    "read_process_urls()\n",
+    "display(HTML(f\"Relevant links found: {len(url_dict.keys())}\"))\n",
+    "display(HTML(f\"------------------------------\"))\n",
+    "\n",
+    "for url in url_dict.keys():\n",
+    "    #print(url)\n",
+    "    display(HTML(f\"{url}\"))\n"
    ]
   }
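Both `read_feeds` and `read_process_urls` swap `sys.stdout` for a `StringIO` and return the captured tqdm/print log as a string, which the cell above discards. A minimal sketch, assuming a notebook front end, of surfacing that log:

    log = read_feeds(feed, feed_age)           # returns the captured progress text
    display(HTML(log.replace("\n", "<br>")))   # render it next to the link list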
+ ],
+ "metadata": {
+  "accelerator": "GPU",
   "colab": {
+   "provenance": []
   },
+  "gpuClass": "standard",
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
 },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
vc_newsletter_results.xlsx (DELETED)

Binary file (5.51 kB)