# CrossCompute
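# Extract and count named entities from a news article.
# target_folder is where a CrossCompute tool saves its output files.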
source_url = 'https://www.theatlantic.com/health/archive/2017/10/_/543975/'
target_folder = '/tmp'
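# Download the article HTML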
import requests
response = requests.get(source_url)
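# Extract the readable text from the paragraph tags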
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'lxml')
text = ' '.join(x.get_text() for x in soup.find_all('p'))
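# Replace curly quotes with their ASCII equivalents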
text = text.replace('’', "'")
text = text.replace('“', '"')
text = text.replace('”', '"')
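# spaCy could be used for entity recognition instead: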
# import spacy
# nlp = spacy.load('en')
# doc = nlp(text)
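# Tag named entities with polyglot (its English models must be downloaded
# first, e.g. with `polyglot download embeddings2.en ner2.en`)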
from polyglot.text import Text
document = Text(text)
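# Group entity names by type, skipping single-character fragments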
from collections import Counter
person_names = []
location_names = []
organization_names = []
for entity in document.entities:
    tag = entity.tag
    name = ' '.join(entity)  # an entity chunk is a list of words
    if len(name) < 2:
        continue
    if tag == 'I-PER':
        person_names.append(name)
    elif tag == 'I-LOC':
        location_names.append(name)
    elif tag == 'I-ORG':
        organization_names.append(name)
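# Count how many times each name appears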
count_by_person_name = Counter(person_names)
count_by_location_name = Counter(location_names)
count_by_organization_name = Counter(organization_names)
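# Save each tally as a CSV table sorted by decreasing count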
import csv
from os.path import join
def save_csv(target_path, count_by_name):
    # Use newline='' so the csv module controls line endings
    with open(target_path, 'wt', newline='') as target_file:
        target_writer = csv.writer(target_file)
        target_writer.writerow(['name', 'count'])
        # Counter.most_common() yields (name, count) pairs, most frequent first
        for name, count in count_by_name.most_common():
            target_writer.writerow([name, count])
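# Printing lines of the form "x_table_path = <path>" is the CrossCompute
# convention for registering each saved file as a tool result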
target_path = join(target_folder, 'person_names.csv')
save_csv(target_path, count_by_person_name)
print('person_table_path = %s' % target_path)
target_path = join(target_folder, 'location_names.csv')
save_csv(target_path, count_by_location_name)
print('location_table_path = %s' % target_path)
target_path = join(target_folder, 'organization_names.csv')
save_csv(target_path, count_by_organization_name)
print('organization_table_path = %s' % target_path)
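The lines below form the CrossCompute result template, labelling each saved
table in the rendered report: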
{ person_table : People }
{ location_table : Locations }
{ organization_table : Organizations }