Test Hypotheses on Human Trafficking




Pay Notebook Creator: Roy Hyunjin Han 0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total 0
In [9]:
# CrossCompute
# Notebook-level settings.
# search_query: phrase of interest. NOTE(review): no active cell in the visible
#   notebook reads this variable -- confirm whether it is consumed elsewhere.
# target_folder: directory where the incidents CSV is written by the export cell.
search_query = 'sex trafficker'
target_folder = '/tmp'
In [10]:
# Disabled one-off fetch: the code is kept inside a string literal so that it
# does not run (and does not hit the remote API) on "Run All". Its result is
# saved to d.json. The trailing `;` suppresses the cell's string output.
# The API token is read from the WEBHOSE_TOKEN environment variable instead of
# being embedded in the notebook, and the output file is closed via `with`.
"""
import os
import simplejson as json
import time
import webhoseio
webhoseio.config(token=os.environ['WEBHOSE_TOKEN'])

d = webhoseio.query('filterWebContent', {
    'q': search_query,
    'ts': '1507927474389',
    'sort': 'relevancy',
})
with open('d.json', 'wt') as response_file:
    json.dump(d, response_file)
# time.sleep(1)
# d = webhoseio.get_next()
""";
In [11]:
import simplejson as json

# Load the search response stored in d.json. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with open('d.json') as response_file:
    d = json.load(response_file)
In [12]:
# Load the spaCy pipeline named 'en_core_web_lg'. `nlp` is called on each
# post's text further down, and the resulting document's `.ents` are read.
# NOTE(review): the model package must already be installed in the kernel's
# environment -- nothing in this notebook installs it.
import spacy
nlp = spacy.load('en_core_web_lg')
In [13]:
from collections import Counter


def get_most_frequent_value(xs, default=None):
    """Return the value that occurs most often in ``xs``.

    Parameters
    ----------
    xs : iterable of hashable
        Values to tally (a string is tallied character by character).
    default : object, optional
        Returned when ``xs`` is empty, instead of raising ``IndexError``.

    Returns
    -------
    object
        The most frequent value. Ties go to the value seen first, because
        ``Counter`` keeps insertion order and ``max`` returns the first
        maximal item.
    """
    counts = Counter(xs)
    if not counts:
        return default
    # A single linear pass; no need to sort every (value, count) pair.
    return max(counts, key=counts.get)


get_most_frequent_value('apple')
Out[13]:
'p'
In [17]:
# Build one row per post: [published, most-mentioned person, most-mentioned place].
rows = []
# Notebook-wide lists of every PERSON / GPE mention seen across all posts.
person_names = []
place_names = []
for post in d['posts']:
    document = nlp(post['text'])
    # Tally entities for THIS post only. Ranking the notebook-wide lists
    # instead would label each row with the most frequent name across all
    # posts processed so far rather than the names in the post itself.
    post_person_names = []
    post_place_names = []
    for entity in document.ents:
        entity_label = entity.label_
        if entity_label == 'PERSON':
            post_person_names.append(entity.text)
        elif entity_label == 'GPE':
            post_place_names.append(entity.text)
    person_names.extend(post_person_names)
    place_names.extend(post_place_names)
    # A post may mention no person or no place; record None (an empty CSV
    # cell) rather than crashing on an empty list.
    person_name = (
        get_most_frequent_value(post_person_names)
        if post_person_names else None)
    place_name = (
        get_most_frequent_value(post_place_names)
        if post_place_names else None)
    rows.append([post['published'], person_name, place_name])
In [18]:
from os.path import join
from pandas import DataFrame

# Write the collected rows to incidents.csv inside target_folder (without the
# index column), then print the resulting path as `a_table_path = <path>`.
target_path = join(target_folder, 'incidents.csv')
incident_table = DataFrame(rows)
incident_table.to_csv(target_path, index=False)
print('a_table_path = %s' % target_path)
a_table_path = /tmp/incidents.csv