feat(disclosures): wrap everything in classes and async, connect telegram and db
This commit is contained in:
		| @@ -1,31 +1,110 @@ | |||||||
| import requests | import requests | ||||||
| from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||||
| import os | import os | ||||||
|  | from telegram_bot import Telegram | ||||||
|  | from db import DB | ||||||
|  | from dotenv import load_dotenv | ||||||
|  | import asyncio | ||||||
|  |  | ||||||
| url = 'https://disclosures-clerk.house.gov' | load_dotenv() | ||||||
| data = {"LastName": "pelosi"} |  | ||||||
| response = requests.post(f'{url}/FinancialDisclosure/ViewMemberSearchResult', data=data) |  | ||||||
|  |  | ||||||
| createdDocumentUrls = {} |  | ||||||
| if 'documentUrls.txt' in os.listdir(): |  | ||||||
|     with open('documentUrls.txt', 'r') as f: |  | ||||||
|         createdDocumentUrls = eval(f.read()) |  | ||||||
|  |  | ||||||
class Disclosures:
    """Find new House financial-disclosure filings and announce them.

    For each ``(id, name)`` row of the ``members`` table the clerk's search
    endpoint is queried by last name. Filings whose URL is not yet stored in
    ``disclosures.link`` are announced through Telegram, and only the
    announcements that were sent successfully are written to the database.
    """

    def __init__(self, telegram_api_key, telegram_channel, db_name, schema_path):
        """Build the Telegram client and the database helper.

        The arguments are forwarded unchanged to ``Telegram(...)`` and
        ``DB(...)``; their exact meaning is defined by those project classes.
        """
        self.telegram = Telegram(telegram_api_key, telegram_channel)
        self.db = DB(db_name, schema_path)

    async def send_message(self, message, return_value=True):
        """Send ``message`` through Telegram.

        Returns:
            ``return_value`` when the send succeeds, ``False`` when it raises.
            ``run`` passes the pending DB row as ``return_value`` so that the
            successful rows can be filtered out of the gathered results.
        """
        # Local import: keeps this block self-contained without touching the
        # file-level import list.
        import logging

        try:
            await self.telegram.send_message(message)
        except Exception:
            # Best-effort delivery: a failed send must not abort the batch,
            # but it should leave a trace instead of vanishing silently.
            logging.getLogger(__name__).exception(
                "Failed to send Telegram message: %s", message
            )
            return False
        return return_value

    def getDocuments(self, name="pelosi", timeout=30):
        """Fetch the filings listed for the member last name ``name``.

        Args:
            name: value posted as ``LastName`` to the search endpoint.
            timeout: seconds passed to ``requests.post`` so that a stalled
                connection cannot hang the run forever.

        Returns:
            A list of dicts with the keys ``name``, ``year``, ``filing`` and
            ``url``, ordered by ascending filing year. Empty when the result
            page contains no rows.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
        """
        disclosures_url = "https://disclosures-clerk.house.gov"
        response = requests.post(
            f"{disclosures_url}/FinancialDisclosure/ViewMemberSearchResult",
            data={"LastName": name},
            timeout=timeout,
        )
        response.raise_for_status()
        return self._parseDocuments(response.text, disclosures_url)

    def _parseDocuments(self, html, base_url):
        """Turn the search-result HTML into the list returned by getDocuments."""
        rows = BeautifulSoup(html, "html.parser").find_all(
            "tr", attrs={"role": "row"}
        )
        documents = []
        # The first matching row is dropped, as before (presumably the table
        # header). Slicing, unlike pop(0), tolerates a page with no rows.
        for row in rows[1:]:
            documents.append(
                {
                    "name": self._cellText(row, "Name"),
                    "year": self._cellText(row, "Filing Year"),
                    "filing": self._cellText(row, "Filing"),
                    # The URL format must stay exactly as is: it is compared
                    # against the links already stored in the database.
                    "url": f'{base_url}/{row.a.get("href")}',
                }
            )

        # Stable sort, so filings of the same year keep their page order.
        documents.sort(key=lambda document: int(document["year"]))
        return documents

    def _cellText(self, row, label):
        """Return the normalised text of the first cell with ``data-label`` == label."""
        return self.formatString(
            row.find_all("td", attrs={"data-label": label})[0].text
        )

    def formatString(self, text):
        """Collapse every run of whitespace in ``text`` to one space and trim it."""
        # split() without arguments already treats newlines and tabs as
        # separators, so no explicit replace() calls are needed.
        return " ".join(text.split())

    def prepareValues(self, documents):
        """Build the rows for filings that are not in the database yet.

        Args:
            documents: mapping of member id to the list of document dicts
                produced by ``getDocuments``.

        Returns:
            A list of ``(member_id, year, filing, url)`` tuples, with ``year``
            converted to ``int``, for every document whose URL is not among
            the values returned by ``SELECT link FROM disclosures;``.
        """
        # A set makes each membership test O(1) instead of scanning a list.
        known_links = {
            row[0] for row in self.db.query("SELECT link FROM disclosures;")
        }
        values = []
        for member_id, member_documents in documents.items():
            for document in member_documents:
                if document["url"] in known_links:
                    continue
                values.append(
                    (
                        member_id,
                        int(document["year"]),
                        document["filing"],
                        document["url"],
                    )
                )
        return values

    def insertDisclosures(self, values):
        """Hand ``values`` to the database helper's ``insertDisclosures``."""
        self.db.insertDisclosures(values)

    async def run(self):
        """Scrape every member, announce new filings and store the sent ones."""
        members = self.db.query("SELECT * FROM members;")
        member_names = {}
        documents = {}
        for member_id, name in members:
            # setdefault keeps the first name seen for an id, matching the
            # previous "first match" lookup.
            member_names.setdefault(member_id, name)
            # NOTE: getDocuments performs blocking HTTP; nothing else is
            # scheduled on the loop at this point.
            documents[member_id] = self.getDocuments(name)

        tasks = []
        for value in self.prepareValues(documents):
            member_id, year, filing, url = value
            message = f"New disclosure from {member_names[member_id]} for the year {year}. {filing} {url}"
            tasks.append(asyncio.create_task(self.send_message(message, value)))
            # Stagger task creation by 100 ms between messages.
            await asyncio.sleep(0.1)

        results = await asyncio.gather(*tasks)
        # send_message yields the row on success and False on failure, so
        # only announced filings are persisted; the rest are retried on the
        # next run because their link is still unknown to the database.
        self.insertDisclosures([result for result in results if result])
|  |  | ||||||
|  |  | ||||||
# Script entry point. The guard keeps the scrape (network, Telegram and
# database access) from starting merely because this module was imported.
if __name__ == "__main__":
    # Configuration comes from the environment; os.getenv yields None for
    # any variable that is not set.
    d = Disclosures(
        os.getenv("TELEGRAM_API_KEY"),
        os.getenv("TELEGRAM_CHANNEL_ID"),
        os.getenv("DB_PATH"),
        os.getenv("SCHEMA_PATH"),
    )
    asyncio.run(d.run())
|   | |||||||
		Reference in New Issue
	
	Block a user