Test Hypotheses on Human Trafficking




Pay Notebook Creator: Roy Hyunjin Han 0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total 0

Extract Names Using Polyglot

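Here we use the polyglot package to extract the names of people, locations, and organizations from the paragraph text of a webpage, and save how often each name appears.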

In [1]:
# CrossCompute
# Address of the web page that is downloaded and scanned for names.
source_url = 'https://www.theatlantic.com/health/archive/2017/10/_/543975/'
# Folder that receives the CSV tables written at the end of the notebook.
target_folder = '/tmp'
In [2]:
import requests
# Download the page. requests applies no timeout by default, so pass one
# (in seconds) to keep this cell from hanging on an unresponsive server.
response = requests.get(source_url, timeout=30)
# Fail here on an HTTP error status rather than extracting names from an
# error page further down.
response.raise_for_status()
In [3]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'lxml')
# Only the text inside <p> elements is kept, joined with single spaces.
paragraph_texts = [paragraph.get_text() for paragraph in soup.find_all('p')]
# Swap typographic quotes for their plain ASCII counterparts in one pass.
plain_quote_table = str.maketrans({'’': "'", '“': '"', '”': '"'})
text = ' '.join(paragraph_texts).translate(plain_quote_table)
In [4]:
# import spacy
# nlp = spacy.load('en')
# doc = nlp(text)
In [5]:
from polyglot.text import Text
# Wrap the page text in a polyglot Text object; its `entities` attribute is
# what the next cell iterates over.
document = Text(text)
# len(document.entities)
In [6]:
from collections import Counter

person_names, location_names, organization_names = [], [], []

# Route each entity tag to the list that collects names of that kind;
# entities carrying any other tag are ignored.
names_by_tag = {
    'I-PER': person_names,
    'I-LOC': location_names,
    'I-ORG': organization_names,
}
for entity in document.entities:
    tag = entity.tag
    # An entity holds one or more tokens; join them into a single name.
    name = ' '.join(entity._collection)
    # Keep only names of at least two characters that carry a known tag.
    if len(name) >= 2 and tag in names_by_tag:
        names_by_tag[tag].append(name)

# Tally how many times each distinct name was seen, per category.
count_by_person_name = Counter(person_names)
count_by_location_name = Counter(location_names)
count_by_organization_name = Counter(organization_names)
In [7]:
import csv
from os.path import join

def save_csv(target_path, d):
    """Write a name-to-count mapping to a CSV file, highest count first.

    The file starts with a ``name,count`` header row, followed by one row
    per item of *d*. Items with equal counts keep the order in which *d*
    yields them, because the sort is stable even with ``reverse=True``.

    Args:
        target_path: Path of the CSV file to create or overwrite.
        d: Mapping of name to count, such as a ``collections.Counter``.
    """
    # newline='' leaves line endings to the csv writer (otherwise rows gain
    # an extra '\r' on platforms that translate '\n'). The explicit UTF-8
    # encoding keeps names with non-ASCII characters independent of the
    # locale's default encoding.
    with open(target_path, 'wt', newline='', encoding='utf-8') as target_file:
        target_writer = csv.writer(target_file)
        target_writer.writerow(['name', 'count'])
        target_writer.writerows(
            sorted(d.items(), key=lambda item: item[1], reverse=True))

# Save one CSV table per entity category and print where each one landed.
# Each entry pairs a file name and its counts with the exact line to print.
for file_name, counts, message in [
    ('person_names.csv', count_by_person_name, 'person_table_path = %s'),
    ('location_names.csv', count_by_location_name, 'location_table_path = %s'),
    ('organization_names.csv', count_by_organization_name, 'organization_table_path = %s'),
]:
    target_path = join(target_folder, file_name)
    save_csv(target_path, counts)
    print(message % target_path)
person_table_path = /tmp/person_names.csv
location_table_path = /tmp/location_names.csv
organization_table_path = /tmp/organization_names.csv

{ person_table : People }

{ location_table : Locations }

{ organization_table : Organizations }