# CrossCompute
# Search phrase for the Webhose query in the disabled fetch snippet below.
search_query = 'sex trafficker'
# Folder in which the resulting incidents.csv is written.
target_folder = '/tmp'
# The string below is a disabled, one-off fetch that produced d.json. It is
# kept as a discarded string literal so that it never runs; paste it into a
# session to refresh d.json.
# NOTE(review): an API token used to be hard-coded in this snippet. Treat that
# token as leaked and rotate it. The environment variable name used below is a
# placeholder -- confirm the name your deployment uses.
"""
import os
import simplejson as json
import time
import webhoseio
webhoseio.config(token=os.environ['WEBHOSEIO_TOKEN'])
d = webhoseio.query('filterWebContent', {
    'q': search_query,
    'ts': '1507927474389',
    'sort': 'relevancy',
})
with open('d.json', 'wt') as posts_file:
    json.dump(d, posts_file)
# time.sleep(1)
# d = webhoseio.get_next()
""";
import simplejson as json
# Load the saved query response; the extraction loop reads d['posts'].
# A context manager closes the file handle instead of leaking it.
with open('d.json', encoding='utf-8') as posts_file:
    d = json.load(posts_file)
import spacy
# spaCy pipeline whose named entities (doc.ents) are used to pick out
# PERSON and GPE mentions in each post.
nlp = spacy.load('en_core_web_lg')
from collections import Counter
def get_most_frequent_value(xs, default=None):
    """Return the value that occurs most often in *xs*.

    Ties are resolved in favour of the value that appears first in *xs*.
    *default* is returned when *xs* is empty, instead of raising IndexError.

    >>> get_most_frequent_value('apple')
    'p'
    """
    # most_common(1) avoids sorting the whole frequency table and yields an
    # empty list for empty input.
    most_common = Counter(xs).most_common(1)
    if not most_common:
        return default
    return most_common[0][0]
# Build one row per post: its 'published' value plus the person and the place
# that the post itself mentions most often.
rows = []
# Running lists of every PERSON / GPE mention across all posts. They are kept
# at module level as before, but no longer drive the per-post result.
person_names = []
place_names = []
for post in d['posts']:
    # Collect mentions per post so that one post's names cannot leak into the
    # rows of the posts that follow it.
    post_person_names = []
    post_place_names = []
    for entity in nlp(post['text']).ents:
        if entity.label_ == 'PERSON':
            post_person_names.append(entity.text)
        elif entity.label_ == 'GPE':
            post_place_names.append(entity.text)
    person_names.extend(post_person_names)
    place_names.extend(post_place_names)
    # A post may contain no PERSON or no GPE entity; record None for it rather
    # than asking for the most frequent value of an empty list.
    person_name = (
        get_most_frequent_value(post_person_names)
        if post_person_names else None)
    place_name = (
        get_most_frequent_value(post_place_names)
        if post_place_names else None)
    rows.append([post['published'], person_name, place_name])
from os.path import join
from pandas import DataFrame
# Save the rows as incidents.csv inside target_folder. Columns are named so
# that the CSV header is meaningful instead of the default 0,1,2.
target_path = join(target_folder, 'incidents.csv')
incident_table = DataFrame(
    rows, columns=['published', 'person_name', 'place_name'])
incident_table.to_csv(target_path, index=False)
# Report the output location on stdout in "name = value" form.
# NOTE(review): presumably this line is parsed by CrossCompute to locate the
# table -- confirm before changing its format.
print('a_table_path = %s' % target_path)