feat(disclosures): wrap everything in classes and async, connect telegram and db
This commit is contained in:
@@ -1,31 +1,110 @@
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import os
|
import os
|
||||||
|
from telegram_bot import Telegram
|
||||||
|
from db import DB
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import asyncio
|
||||||
|
|
||||||
import ast  # used below to parse the cached mapping without executing it

load_dotenv()

# Flat script: fetch the filings listed for one member, group their document
# URLs by filing year, and persist the mapping to documentUrls.txt.
url = 'https://disclosures-clerk.house.gov'
data = {"LastName": "pelosi"}
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.post(
    f'{url}/FinancialDisclosure/ViewMemberSearchResult', data=data, timeout=30
)

# Load the mapping written by a previous run, if there is one.
createdDocumentUrls = {}
if os.path.isfile('documentUrls.txt'):
    with open('documentUrls.txt', 'r') as f:
        # The file holds str(dict) as written at the end of this script;
        # literal_eval parses that literal without executing arbitrary code.
        createdDocumentUrls = ast.literal_eval(f.read())

parsed_html = BeautifulSoup(response.text, 'html.parser')
# Drop the first matched row (presumably the table header -- confirm against
# the page markup). Slicing, unlike pop(0), tolerates an empty result set.
fillings = parsed_html.find_all('tr', attrs={'role': 'row'})[1:]

# sort fillings by year
fillings.sort(
    key=lambda x: int(x.find_all('td', attrs={"data-label": "Filing Year"})[0].text)
)

documentUrls = {}
for filling in fillings:
    key = filling.find_all('td', attrs={"data-label": "Filing Year"})[0].text
    # Build each link from the site root. Rebinding `url` here would make
    # every later link grow on top of the previous document's URL.
    document_url = f'{url}/{filling.a.get("href")}'
    documentUrls.setdefault(key, []).append(document_url)

print(len(documentUrls))

# save the documentUrls to a file
with open('documentUrls.txt', 'w') as f:
    f.write(str(documentUrls))
|
class Disclosures:
    """Announce newly published House financial-disclosure filings.

    Filings are scraped from disclosures-clerk.house.gov for every member
    stored in the database, compared against the links already recorded,
    announced through Telegram, and recorded once the announcement was sent.
    """

    def __init__(self, telegram_api_key, telegram_channel, db_name, schema_path):
        """Create the Telegram client and the database handle.

        Args:
            telegram_api_key: First argument forwarded to ``Telegram``.
            telegram_channel: Second argument forwarded to ``Telegram``.
            db_name: First argument forwarded to ``DB``.
            schema_path: Second argument forwarded to ``DB``.
        """
        self.telegram = Telegram(telegram_api_key, telegram_channel)
        self.db = DB(db_name, schema_path)

    async def send_message(self, message, return_value=True):
        """Send *message* via Telegram without letting a failure propagate.

        Args:
            message: Text handed to ``self.telegram.send_message``.
            return_value: Value to return when sending succeeds; ``run`` uses
                it to carry the database row that belongs to the message.

        Returns:
            ``return_value`` on success, ``False`` if sending raised.
        """
        try:
            await self.telegram.send_message(message)
        except Exception:
            # Best-effort delivery: keep going, but leave a trace of the failure
            # instead of discarding it silently.
            import logging

            logging.getLogger(__name__).exception("Failed to send Telegram message")
            return False
        return return_value

    def getDocuments(self, name="pelosi"):
        """Fetch the filings listed for the member search *name*.

        Args:
            name: Value posted as the ``LastName`` search field.

        Returns:
            A list of dicts with keys ``name``, ``year``, ``filing`` and
            ``url``, ordered by ascending filing year. Rows lacking a link or
            one of the expected cells are skipped.

        Raises:
            requests.RequestException: On network failure, timeout, or an
                HTTP error status.
            ValueError: If a filing-year cell is not an integer.
        """
        disclosures_url = "https://disclosures-clerk.house.gov"
        response = requests.post(
            f"{disclosures_url}/FinancialDisclosure/ViewMemberSearchResult",
            data={"LastName": name},
            timeout=30,  # never hang forever on an unresponsive server
        )
        response.raise_for_status()

        parsed_html = BeautifulSoup(response.text, "html.parser")
        # Drop the first matched row (presumably the table header -- confirm
        # against the page markup). Slicing tolerates an empty result set.
        rows = parsed_html.find_all("tr", attrs={"role": "row"})[1:]

        documents = []
        for row in rows:
            cells = {
                label: row.find("td", attrs={"data-label": label})
                for label in ("Filing Year", "Name", "Filing")
            }
            link = row.a
            href = link.get("href") if link is not None else None
            if not href or any(cell is None for cell in cells.values()):
                continue
            documents.append(
                {
                    "name": self.formatString(cells["Name"].text),
                    "year": self.formatString(cells["Filing Year"].text),
                    "filing": self.formatString(cells["Filing"].text),
                    "url": f"{disclosures_url}/{href}",
                }
            )

        # sort fillings by year (stable, so page order is kept within a year)
        documents.sort(key=lambda document: int(document["year"]))
        return documents

    def formatString(self, text):
        """Collapse every run of whitespace in *text* into a single space.

        Leading and trailing whitespace is removed. ``str.split()`` without
        arguments already treats newlines and tabs as separators.
        """
        return " ".join(text.split())

    def prepareValues(self, documents):
        """Build insert rows for the documents that are not stored yet.

        Args:
            documents: Mapping of member id to the list of document dicts
                produced by ``getDocuments``.

        Returns:
            A list of ``(member_id, year, filing, url)`` tuples, one for each
            document whose URL is absent from the ``disclosures`` table.
        """
        # A set makes each membership check O(1) instead of a list scan.
        already_inserted = {
            row[0] for row in self.db.query("SELECT link FROM disclosures;")
        }
        values = []
        for member_id, member_documents in documents.items():
            for document in member_documents:
                if document["url"] in already_inserted:
                    continue
                values.append(
                    (
                        member_id,
                        int(document["year"]),
                        document["filing"],
                        document["url"],
                    )
                )
        return values

    def insertDisclosures(self, values):
        """Hand *values* to the database layer for insertion."""
        self.db.insertDisclosures(values)

    async def run(self):
        """Scrape every member, announce unseen filings, then record them.

        Only rows whose Telegram message was sent successfully are inserted,
        so a failed announcement is attempted again on the next run.
        """
        members = self.db.query("SELECT * FROM members;")
        names = {}
        documents = {}
        for member_id, name in members:
            # setdefault keeps the first name seen for an id.
            names.setdefault(member_id, name)
            documents[member_id] = self.getDocuments(name)

        values = self.prepareValues(documents)
        tasks = []
        for value in values:
            member_id, year, filing, url = value
            message = (
                f"New disclosure from {names[member_id]} "
                f"for the year {year}. {filing} {url}"
            )
            # The row is passed as return_value so gather() hands it back
            # when (and only when) the message went out.
            tasks.append(asyncio.create_task(self.send_message(message, value)))
            # Space out task starts (presumably to stay under Telegram rate
            # limits -- confirm).
            await asyncio.sleep(0.1)

        results = await asyncio.gather(*tasks)
        sent = [result for result in results if result]

        self.insertDisclosures(sent)
|
||||||
|
|
||||||
|
|
||||||
|
# Build the notifier from environment configuration and run a single pass.
d = Disclosures(
    telegram_api_key=os.getenv("TELEGRAM_API_KEY"),
    telegram_channel=os.getenv("TELEGRAM_CHANNEL_ID"),
    db_name=os.getenv("DB_PATH"),
    schema_path=os.getenv("SCHEMA_PATH"),
)
asyncio.run(d.run())
|
||||||
|
|||||||
Reference in New Issue
Block a user