32 lines
		
	
	
		
			1000 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			32 lines
		
	
	
		
			1000 B
		
	
	
	
		
			Python
		
	
	
	
	
	
import requests
from bs4 import BeautifulSoup
import os
import ast

# Base URL of the U.S. House of Representatives financial-disclosure site.
url = 'https://disclosures-clerk.house.gov'

# Search the member-disclosure database by last name.
data = {"LastName": "pelosi"}
response = requests.post(f'{url}/FinancialDisclosure/ViewMemberSearchResult', data=data)

# Load previously saved document URLs, if the file exists.
# NOTE(review): createdDocumentUrls is loaded but never consulted below —
# presumably intended for de-duplicating already-seen documents; confirm
# and either use it or drop the load.
createdDocumentUrls = {}
if 'documentUrls.txt' in os.listdir():
    with open('documentUrls.txt', 'r') as f:
        # literal_eval safely parses the str(dict) written at the bottom of
        # this script; eval() would execute arbitrary code from the file.
        createdDocumentUrls = ast.literal_eval(f.read())

parsed_html = BeautifulSoup(response.text, 'html.parser')
fillings = parsed_html.find_all('tr', attrs={'role': 'row'})
fillings.pop(0)  # drop the table header row

# Sort fillings by year (ascending).
fillings.sort(key=lambda x: int(x.find_all('td', attrs={"data-label": "Filing Year"})[0].text))

# Group the absolute document URLs by filing year.
documentUrls = {}
for filling in fillings:
    key = filling.find_all('td', attrs={"data-label": "Filing Year"})[0].text
    # BUG FIX: the original reassigned `url` here, so from the second
    # iteration on each link was appended to the PREVIOUS document URL
    # instead of the site base. Use a separate variable for the per-row URL.
    doc_url = f'{url}/{filling.a.get("href")}'
    documentUrls.setdefault(key, []).append(doc_url)

print(len(documentUrls))

# Persist the grouped URLs as a dict literal (readable by literal_eval above).
with open('documentUrls.txt', 'w') as f:
    f.write(str(documentUrls))