feat(disclosures): wrap everything in classes and async, connect telegram and db
This commit is contained in:
		| @@ -1,31 +1,110 @@ | ||||
import ast
import asyncio
import logging
import os

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

from db import DB
from telegram_bot import Telegram
|  | ||||
# Legacy one-shot scrape: query the House Clerk disclosure search for a single
# member, group the resulting document links by filing year, and persist the
# mapping to documentUrls.txt.
url = 'https://disclosures-clerk.house.gov'
data = {"LastName": "pelosi"}
response = requests.post(f'{url}/FinancialDisclosure/ViewMemberSearchResult', data=data)

# Mapping saved by a previous run (written below with str(dict)).
createdDocumentUrls = {}
if os.path.isfile('documentUrls.txt'):
    with open('documentUrls.txt', 'r') as f:
        # SECURITY: this used to be eval(), which executes arbitrary code found
        # in the file. The file only ever holds the repr of a dict of lists of
        # strings, so literal_eval parses it without executing anything.
        createdDocumentUrls = ast.literal_eval(f.read())

parsed_html = BeautifulSoup(response.text, 'html.parser')
# The first matching row is the table header. Slicing (unlike pop(0)) does not
# raise when the search returned no rows at all.
fillings = parsed_html.find_all('tr', attrs={'role': 'row'})[1:]

# sort fillings by year
fillings.sort(key=lambda x: int(x.find_all('td', attrs={"data-label": "Filing Year"})[0].text))
documentUrls = {}
for filling in fillings:
    key = filling.find_all('td', attrs={"data-label": "Filing Year"})[0].text
    # Use a separate name for the document link: rebinding `url` here made
    # every later link start with the previous link instead of the site root.
    documentUrl = f'{url}/{filling.a.get("href")}'
    documentUrls.setdefault(key, []).append(documentUrl)
load_dotenv()


print(len(documentUrls))
# save the documentUrls to a file
with open('documentUrls.txt', 'w') as f:
    f.write(str(documentUrls))
class Disclosures:
    """Announce newly published House financial disclosures on Telegram.

    For every row of the ``members`` table the House Clerk disclosure search
    is scraped, documents whose URL is not yet stored in the ``disclosures``
    table are announced through the Telegram client, and the announced ones
    are handed to the database wrapper for insertion.
    """

    _BASE_URL = "https://disclosures-clerk.house.gov"
    # Seconds to wait for the disclosure site before giving up on a request.
    _REQUEST_TIMEOUT = 30
    # Pause between scheduling two Telegram messages, in seconds.
    _SEND_INTERVAL = 0.1
    _logger = logging.getLogger(__name__)

    def __init__(self, telegram_api_key, telegram_channel, db_name, schema_path):
        """Create the Telegram client and the database wrapper.

        Args:
            telegram_api_key: Passed to ``Telegram`` as its first argument.
            telegram_channel: Passed to ``Telegram`` as its second argument.
            db_name: Passed to ``DB`` as its first argument.
            schema_path: Passed to ``DB`` as its second argument.
        """
        self.telegram = Telegram(telegram_api_key, telegram_channel)
        self.db = DB(db_name, schema_path)

    async def send_message(self, message, return_value=True):
        """Send ``message`` via Telegram without letting a failure propagate.

        Args:
            message: Text forwarded to ``self.telegram.send_message``.
            return_value: Value returned when sending succeeds; lets callers
                tag each send with the record it belongs to.

        Returns:
            ``return_value`` on success, ``False`` if sending raised.
        """
        try:
            await self.telegram.send_message(message)
        except Exception:
            # Best effort by design: one failed message must not abort the
            # batch, but the failure is logged instead of vanishing silently.
            self._logger.exception("Failed to send Telegram message: %s", message)
            return False
        return return_value

    def getDocuments(self, name="pelosi"):
        """Scrape the disclosure search results for one last name.

        Args:
            name: Value submitted as the ``LastName`` form field.

        Returns:
            A list of dicts with the keys ``name``, ``year``, ``filing`` and
            ``url`` (all strings), ordered by ascending filing year. Empty
            when the result page contains no data rows.

        Raises:
            requests.RequestException: On connection problems, timeouts or a
                non-success HTTP status.
        """
        response = requests.post(
            f"{self._BASE_URL}/FinancialDisclosure/ViewMemberSearchResult",
            data={"LastName": name},
            timeout=self._REQUEST_TIMEOUT,
        )
        # Surface HTTP errors directly instead of parsing an error page.
        response.raise_for_status()

        parsed_html = BeautifulSoup(response.text, "html.parser")
        # The first matching row is the table header. Slicing (unlike pop(0))
        # does not raise when the search returned no rows at all.
        rows = parsed_html.find_all("tr", attrs={"role": "row"})[1:]

        documents = []
        for row in rows:
            documents.append(
                {
                    "name": self._cellText(row, "Name"),
                    "year": self._cellText(row, "Filing Year"),
                    "filing": self._cellText(row, "Filing"),
                    "url": f'{self._BASE_URL}/{row.a.get("href")}',
                }
            )

        # sort() is stable, so documents of the same year keep page order.
        documents.sort(key=lambda document: int(document["year"]))
        return documents

    def _cellText(self, row, label):
        """Return the normalized text of the first ``td`` tagged ``label``."""
        cell = row.find_all("td", attrs={"data-label": label})[0]
        return self.formatString(cell.text)

    def formatString(self, text):
        """Collapse every run of whitespace in ``text`` to a single space.

        Leading and trailing whitespace is removed as well.
        """
        # split() without arguments already treats newlines and tabs as
        # separators and drops empty fields.
        return " ".join(text.split())

    def prepareValues(self, documents):
        """Build insert tuples for the documents not yet stored.

        Args:
            documents: Mapping of member id to the list of document dicts
                produced by ``getDocuments``.

        Returns:
            A list of ``(member_id, year, filing, url)`` tuples, ``year``
            converted to ``int``, for every document whose URL is not among
            the ``link`` values of the ``disclosures`` table.
        """
        # A set keeps the membership test O(1) per document.
        already_inserted = {
            row[0] for row in self.db.query("SELECT link FROM disclosures;")
        }
        return [
            (member_id, int(document["year"]), document["filing"], document["url"])
            for member_id, member_documents in documents.items()
            for document in member_documents
            if document["url"] not in already_inserted
        ]

    def insertDisclosures(self, values):
        """Hand ``values`` to the database wrapper's ``insertDisclosures``."""
        self.db.insertDisclosures(values)

    async def run(self):
        """Scrape all members, announce new disclosures and record them.

        Only disclosures whose Telegram message was sent successfully are
        passed on for insertion.
        """
        member_names = {}
        documents = {}
        # Single pass over the query result: also works if query() returns a
        # one-shot iterator, and avoids re-scanning members for every message.
        for member_id, member_name in self.db.query("SELECT * FROM members;"):
            member_names.setdefault(member_id, member_name)
            documents[member_id] = self.getDocuments(member_name)

        tasks = []
        for value in self.prepareValues(documents):
            member_id, year, filing, url = value
            message = f"New disclosure from {member_names[member_id]} for the year {year}. {filing} {url}"
            # send_message returns `value` on success, so gather() yields
            # exactly the tuples that may be recorded as sent.
            tasks.append(asyncio.create_task(self.send_message(message, value)))
            # Stagger the sends instead of firing them all at once.
            await asyncio.sleep(self._SEND_INTERVAL)

        results = await asyncio.gather(*tasks)
        sent = [result for result in results if result]

        self.insertDisclosures(sent)
|  | ||||
# Build the service from environment configuration (positional order:
# Telegram API key, Telegram channel id, database path, schema path) and run
# a single polling pass.
d = Disclosures(
    *(
        os.getenv(variable)
        for variable in (
            "TELEGRAM_API_KEY",
            "TELEGRAM_CHANNEL_ID",
            "DB_PATH",
            "SCHEMA_PATH",
        )
    )
)
asyncio.run(d.run())
|   | ||||
		Reference in New Issue
	
	Block a user