We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking. Thanks to Aida Shoydokova for writing much of the original code.
# CrossCompute
# Path of the CSV table of press releases to analyze
usa_doj_press_release_table_path = \
    'human-trafficking-usa-doj-20171111-1730-sample-30.csv'
# Lowercase prefix that the root word of a noun chunk must start with
keyword_prefix = 'traffick'
# Path of the text file listing one category per line
category_text_path = 'human-trafficking-categories.txt'
# Folder where the resulting table is saved
target_folder = '/tmp'
from os import environ
# Read the resource levels from the environment. Both default to 0 so that
# a missing variable produces the warnings below instead of a TypeError
# from int(None).
environment_level = int(environ.get('CROSSCOMPUTE_ENVIRONMENT_LEVEL', 0))
memory_level = int(environ.get('CROSSCOMPUTE_MEMORY_LEVEL', 0))
# Report insufficient levels as error messages without stopping the script
if environment_level < 1:
    print(
        'environment_level.error = environment level must be set to '
        'computational in order to use the spacy package because it '
        'takes too long to install')
if memory_level < 3:
    print(
        'memory_level.error = memory level should be set to large '
        'or higher in order to use the en_core_web_lg spacy model')
import spacy
# Load en_core_web_lg only when the memory level allows it; otherwise
# load the model registered as 'en'
spacy_model = 'en' if memory_level < 3 else 'en_core_web_lg'
print('spacy_model = %s' % spacy_model)
nlp = spacy.load(spacy_model)
import pandas as pd
# Load the press release table and report how many documents it contains
t = pd.read_csv(usa_doj_press_release_table_path)
print('document_count = %s' % len(t))
# Load one lowercased category per line; the context manager makes sure
# the file handle is closed after reading
with open(category_text_path) as category_file:
    categories = [x.lower() for x in category_file.read().splitlines()]
# Bare expression kept from the notebook; it has no effect in a script
categories
import re
# Matches "human" followed by whitespace and "traffick", in any letter case
PATTERN_KEYWORD = re.compile(r'human\s+traffick', re.IGNORECASE)


def has_right_topic(topic_names):
    """Return True if any semicolon-separated topic matches the keyword.

    A null value (None or NaN) counts as having no matching topic.
    """
    if pd.isnull(topic_names):
        return False
    return any(
        PATTERN_KEYWORD.search(name) for name in topic_names.split(';'))
# Flag each document whose topic names match the keyword pattern
topic_names = t['topic_names']
t['has_right_topic'] = topic_names.map(has_right_topic)
# Report the number of documents with the right topic
print('matching_topic_document_count = %s' % t['has_right_topic'].sum())
def has_empty_topic_but_right_text(row):
    """Return True if a row has no topics but its text matches the keyword.

    The row must provide 'topic_names', 'title' and 'body'. Rows whose
    topic_names is not null are rejected. Otherwise the title is checked
    first and then the body; null values are skipped.
    """
    if not pd.isnull(row['topic_names']):
        return False
    for column in ('title', 'body'):
        text = row[column]
        if not pd.isnull(text) and PATTERN_KEYWORD.search(text):
            return True
    return False
# Flag documents that have no topics but mention the keyword in their text
t['has_empty_topic_but_right_text'] = t.apply(
    has_empty_topic_but_right_text, axis=1)
# Report the number of documents with empty topic but right text
print('empty_topic_matching_text_document_count = %s' % t[
    'has_empty_topic_but_right_text'].sum())
# Keep a copy of the documents flagged by either test and report the count
is_selected = t['has_right_topic'] | t['has_empty_topic_but_right_text']
selected_t = t[is_selected].copy()
print('human_trafficking_document_count = %s' % len(selected_t))
import re
from bs4 import BeautifulSoup
# Matches every run of one or more consecutive newlines
NEWLINE_PATTERN = re.compile(r'\n+', re.MULTILINE)


def get_body_text(x):
    """Return the text of an HTML document with tags removed.

    x is either an object that already provides getText() or markup that
    is parsed with BeautifulSoup using the lxml parser. A missing value
    (None or NaN) yields an empty string instead of crashing the parser.

    Text pieces are separated by newlines, surrounding whitespace is
    stripped and runs of newlines are collapsed into one.
    """
    if not hasattr(x, 'getText'):
        if pd.isnull(x):
            # Nothing to parse, e.g. an empty cell in the table
            return ''
        x = BeautifulSoup(x, 'lxml')
    # Extract text without tags
    text = x.getText(separator=u'\n').strip()
    # Replace multiple newlines with a single newline
    return NEWLINE_PATTERN.sub('\n', text)
# Store the tag-free text of each selected document next to its raw body
selected_t['body_text'] = selected_t['body'].apply(get_body_text)
from itertools import combinations
def combine_matches(count_by_x):
    """Combine first or last name counts with full name counts.

    For every pair of keys where one key is a substring of the other, the
    count of the shorter key is added to the longer key and the shorter
    key is reset to zero. Returns a new dict; the argument is not changed.
    """
    merged = dict(count_by_x)
    for pair in combinations(merged.keys(), 2):
        # Distinct keys of equal length cannot contain each other, so the
        # shorter key is the only possible abridged form
        abridged_x, full_x = sorted(pair, key=len)
        if abridged_x not in full_x:
            continue
        merged[full_x] += merged[abridged_x]
        merged[abridged_x] = 0
    return merged


# Example kept from the notebook; the result is discarded in a script
combine_matches({
    'benjamin b wagner': 1,
    'benjamin': 3,
    'anthony w ishii': 2,
})
# Load the xz-compressed name table, using its first column as the index
name_count_table = pd.read_csv(
    'names-usa.csv.xz',
    index_col=0,
    compression='xz',
)
def get_gender(name):
    """Look up a name's first word in name_count_table.

    The first whitespace-separated word of name is lowercased and used as
    the index label. Returns the column label holding the largest value in
    the matching row (presumably the gender with the highest count --
    verify against the columns of names-usa.csv.xz). Returns None when the
    name is empty, None, whitespace only, or absent from the table.
    """
    # A whitespace-only name has no words, so guard before taking the first
    words = name.split() if name else []
    if not words:
        return None
    given_name = words[0].lower()
    try:
        selected_table = name_count_table.loc[given_name]
    except KeyError:
        return None
    return selected_table.idxmax()
# Example lookup kept from the notebook; the return value is not stored or
# printed, so this has no visible effect when run as a script
get_gender('jay leno')
from collections import defaultdict
def get_category_with_spacy(spacy_doc):
    """Return the category with the most weighted votes, or None.

    Only noun chunks whose root word starts with keyword_prefix are
    considered. Each category that appears as a whole word in such a chunk
    receives a vote, and every vote weighs 0.9 times the previous one so
    that earlier mentions count more. Returns None when no category
    received a vote.
    """
    vote_by_category = defaultdict(int)
    vote_weight = 1  # Count votes less if the term appears later
    for chunk in spacy_doc.noun_chunks:
        # Ignore chunks whose root does not include our keyword
        if not chunk.root.text.lower().startswith(keyword_prefix):
            continue
        # Categories are lowercased when loaded, so lowercase the chunk
        # too; otherwise capitalised words would never match. Split once
        # per chunk instead of once per category.
        chunk_words = chunk.text.lower().split()
        for x in categories:
            if x in chunk_words:
                vote_by_category[x] += vote_weight
                vote_weight *= 0.9
    if not vote_by_category:
        return None
    return pd.Series(vote_by_category).idxmax()
from collections import Counter
def extract_information_with_spacy(row):
    """Extract the main place, name, gender and category from a document.

    The row's body_text is processed with the loaded spaCy pipeline. GPE
    and PERSON entity texts are lowercased, stripped of periods and
    surrounding whitespace, and counted; partial matches are merged with
    combine_matches and the most frequent text of each kind is kept.
    Returns a Series with the keys place, name, gender and category, where
    place and name are None if no entity of that kind was found.
    """
    spacy_doc = nlp(row.body_text)
    texts_by_label = {'GPE': [], 'PERSON': []}
    for entity in spacy_doc.ents:
        text = entity.text.lower().replace('.', '').strip()
        if len(text) < 2:
            continue  # Skip single characters
        matching_texts = texts_by_label.get(entity.label_)
        if matching_texts is not None:
            matching_texts.append(text)

    def get_top_match(texts):
        'Return the most frequent text after merging partial matches'
        if not texts:
            return None
        count_by_text = combine_matches(Counter(texts))
        return pd.Series(count_by_text).idxmax()

    place = get_top_match(texts_by_label['GPE'])
    name = get_top_match(texts_by_label['PERSON'])
    return pd.Series({
        'place': place,
        'name': name,
        'gender': get_gender(name),
        'category': get_category_with_spacy(spacy_doc),
    })
# Extract one row of information per selected document
extracted_t = selected_t.apply(extract_information_with_spacy, axis=1)
# Save the table without its index and report where it was written
target_path = target_folder + '/incidents.csv'
extracted_t.to_csv(target_path, index=False)
print('incident_table_path = %s' % target_path)