We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking.
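We use requests and BeautifulSoup to perform the following steps: download a web page, extract its title and body text, collapse repeated newlines, and save the text to a file.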
# CrossCompute
# NOTE(review): the marker above presumably tags this cell's variables as
# CrossCompute tool inputs -- confirm before renaming or restructuring them.
# Address of the page that is downloaded below.
url = 'https://www.unodc.org'
# Folder in which the extracted text file (raw.txt) is written.
target_folder = '/tmp'
import requests

# Download the page. requests has no default timeout, so without one an
# unresponsive server would block this script indefinitely.
response = requests.get(url, timeout=30)
# Stop on 4xx/5xx responses instead of treating an error page as content.
response.raise_for_status()
# Keep the raw bytes of the response body.
html = response.content
# Preview of the first 200 bytes (shown when run as a notebook cell).
html[:200]
from bs4 import BeautifulSoup

# Parse the downloaded markup with the lxml parser.
soup = BeautifulSoup(html, 'lxml')
# find() returns None when the page has no <title> element; report an empty
# title in that case instead of raising AttributeError on `.text`.
title = soup.find('title')
title_text = title.text if title is not None else ''
print('title = ' + title_text)
# Work on the <body> element; fall back to the whole document when the page
# has no <body> so the steps below do not fail on None.
body = soup.find('body')
if body is None:
    body = soup
# Remove script content so that JavaScript source does not end up in the text
for script_tag in body.find_all('script'):
    script_tag.decompose()
# Extract text without tags, one text fragment per line
text = body.get_text(separator='\n').strip()
# Preview the first 70 characters
print(text[:70])
import re

# Squeeze every run of consecutive newlines down to a single newline.
text = re.sub(r'\n+', '\n', text, flags=re.MULTILINE)
# Preview the first 89 characters of the cleaned text.
preview = text[:89]
print(preview)
# Save the cleaned text as raw.txt inside the target folder.
target_path = target_folder + '/raw.txt'
# The context manager closes (and flushes) the file deterministically, and
# the explicit encoding avoids UnicodeEncodeError on platforms whose default
# text encoding cannot represent every character found on the page.
with open(target_path, 'wt', encoding='utf-8') as target_file:
    target_file.write(text)
# Report where the text was written.
print('body_text_path = ' + target_path)