# Recovered from patch 80ab4508e2fe ("feat(disclosures): basic get and save in file").
"""Fetch financial-disclosure filings for one House member and save their URLs.

Posts a last-name search to the House Clerk disclosure site, groups the
document links found in the result table by filing year, prints the number
of distinct years found and writes the mapping to ``documentUrls.txt`` as
the ``str()`` of a ``{year: [url, ...]}`` dict.
"""
import ast
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = 'https://disclosures-clerk.house.gov'
data = {"LastName": "pelosi"}

OUTPUT_FILE = 'documentUrls.txt'
YEAR_CELL_ATTRS = {"data-label": "Filing Year"}
REQUEST_TIMEOUT = 30  # seconds; requests waits forever when no timeout is given


def _load_saved_urls(path):
    """Return the mapping saved by a previous run, or ``{}`` if there is none."""
    if not os.path.isfile(path):
        return {}
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read().strip()
    if not text:
        return {}
    # SECURITY: this used to be eval(), which would execute whatever the file
    # contains. literal_eval parses the same str(dict) format but only accepts
    # Python literals.
    return ast.literal_eval(text)


def _filing_year(row):
    """Return the stripped text of the row's "Filing Year" cell, or None."""
    cell = row.find('td', attrs=YEAR_CELL_ATTRS)
    if cell is None:
        return None
    return cell.get_text(strip=True)


def _group_urls_by_year(rows, base_url):
    """Group absolute document URLs by filing year, in ascending year order.

    Rows without a numeric filing year or without a link are skipped instead
    of aborting the whole run. Rows of the same year keep their page order
    because the sort is stable.
    """
    entries = []
    for row in rows:
        year = _filing_year(row)
        link = row.a
        href = link.get('href') if link is not None else None
        if not year or not year.isdigit() or not href:
            continue
        entries.append((int(year), year, href))
    entries.sort(key=lambda entry: entry[0])

    grouped = {}
    for _, year, href in entries:
        # Always join against the site root. The previous code reassigned the
        # base URL inside the loop, so every URL after the first was appended
        # to the preceding document's URL.
        grouped.setdefault(year, []).append(urljoin(f'{base_url}/', href))
    return grouped


response = requests.post(
    f'{url}/FinancialDisclosure/ViewMemberSearchResult',
    data=data,
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly on an HTTP error instead of parsing an error page as results.
response.raise_for_status()

# NOTE(review): loaded but not consumed anywhere in this script yet.
createdDocumentUrls = _load_saved_urls(OUTPUT_FILE)

parsed_html = BeautifulSoup(response.text, 'html.parser')
# The first matching row is dropped as in the original script (presumably the
# table header - confirm against the live markup). Slicing tolerates an empty
# result, where pop(0) raised IndexError.
fillings = parsed_html.find_all('tr', attrs={'role': 'row'})[1:]

documentUrls = _group_urls_by_year(fillings, url)

print(len(documentUrls))
# save the documentUrls to a file
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    f.write(str(documentUrls))