We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking. Thanks to Aida Shoydokova for writing much of the original code.
# CrossCompute
usa_doj_press_release_table_path = (
    'human-trafficking-usa-doj-20171111-1730-sample-30.csv')
keyword_prefix = 'traffick'
category_text_path = 'human-trafficking-categories.txt'
target_folder = '/tmp'
try:
    import nltk
except ImportError:
    # pip.main was removed in pip 10; install through the interpreter instead
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'nltk'])
    import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
import pandas as pd
t = pd.read_csv(usa_doj_press_release_table_path)
print('document_count = %s' % len(t))
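The notebook assumes the table has at least topic_names, title and body columns, which the filters below rely on. Peek at a few rows to confirm before filtering:

t.head()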
categories = [x.lower() for x in open(
    category_text_path).read().splitlines()]
categories
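For reference, the file is expected to hold one category name per line; a hypothetical two-line file would parse as follows:

# Hypothetical file contents, not the actual category list
[x.lower() for x in 'Labor\nSex'.splitlines()]
# ['labor', 'sex']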
import re

PATTERN_KEYWORD = re.compile(r'human\s+traffick', re.IGNORECASE)

def has_right_topic(topic_names):
    'Return True if any of the semicolon-separated topics matches'
    if pd.isnull(topic_names):
        return False
    for topic_name in topic_names.split(';'):
        if PATTERN_KEYWORD.search(topic_name):
            return True
    return False
t['has_right_topic'] = t['topic_names'].map(has_right_topic)
# Count the number of documents with the right topic
print('matching_topic_document_count = %s' % t[
    'has_right_topic'].sum())
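A quick sanity check on the topic filter, using made-up topic strings:

assert has_right_topic('Human Trafficking; Project Safe Childhood')
assert not has_right_topic('Tax Fraud')
assert not has_right_topic(float('nan'))  # Blank topics do not match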
def has_empty_topic_but_right_text(row):
    'Return True if the topics are blank but the title or body matches'
    topic_names = row['topic_names']
    if not pd.isnull(topic_names):
        return False
    title = row['title']
    if not pd.isnull(title) and PATTERN_KEYWORD.search(title):
        return True
    body = row['body']
    if not pd.isnull(body) and PATTERN_KEYWORD.search(body):
        return True
    return False
t['has_empty_topic_but_right_text'] = t.apply(
    has_empty_topic_but_right_text, axis=1)
# Count the number of documents with empty topic but right text
print('empty_topic_matching_text_document_count = %s' % t[
    'has_empty_topic_but_right_text'].sum())
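The same kind of check works for the fallback filter; the rows here are made up:

assert has_empty_topic_but_right_text(pd.Series({
    'topic_names': None,
    'title': 'Defendant Sentenced in Human Trafficking Conspiracy',
    'body': ''}))
assert not has_empty_topic_but_right_text(pd.Series({
    'topic_names': 'Human Trafficking',
    'title': '',
    'body': ''}))  # Rows with topics are already handled above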
# Count the number of human trafficking court cases
selected_t = t[
    t.has_right_topic | t.has_empty_topic_but_right_text].copy()
print('human_trafficking_document_count = %s' % len(selected_t))
import re
from bs4 import BeautifulSoup

NEWLINE_PATTERN = re.compile(r'\n+', re.MULTILINE)

def get_body_text(x):
    if not hasattr(x, 'getText'):
        x = BeautifulSoup(x, 'lxml')
    # Extract text without tags
    text = x.getText(separator='\n').strip()
    # Replace multiple newlines with a single newline
    text = NEWLINE_PATTERN.sub('\n', text)
    return text
selected_t['body_text'] = selected_t['body'].map(get_body_text)
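For example, tags are stripped and each string lands on its own line (made-up markup):

get_body_text('<p>Guilty plea entered.</p><p>Sentencing follows.</p>')
# 'Guilty plea entered.\nSentencing follows.'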
def tag_part_of_speech(text):
    'Return one list of (word, tag) pairs per sentence'
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(x) for x in sentences]
    sentences = [nltk.pos_tag(x) for x in sentences]
    return sentences

tag_part_of_speech('Here I am.')
from itertools import combinations

def combine_matches(count_by_x):
    'Combine first or last name counts with full name counts'
    d = dict(count_by_x)
    for x1, x2 in combinations(d.keys(), 2):
        if x1 in x2:
            abridged_x = x1
            full_x = x2
        elif x2 in x1:
            abridged_x = x2
            full_x = x1
        else:
            continue
        d[full_x] += d[abridged_x]
        d[abridged_x] = 0
    return d
combine_matches({
    'benjamin b wagner': 1,
    'benjamin': 3,
    'anthony w ishii': 2,
})
# {'benjamin b wagner': 4, 'benjamin': 0, 'anthony w ishii': 2}
name_count_table = pd.read_csv(
    'names-usa.csv.xz', compression='xz', index_col=0)

def get_gender(name):
    'Guess gender from the most common gender for the given name'
    if not name:
        return
    given_name = name.split()[0].lower()
    try:
        selected_table = name_count_table.loc[given_name]
    except KeyError:
        return
    return selected_table.idxmax()
get_gender('jay leno')
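This lookup assumes names-usa.csv.xz is indexed by lowercase given name with one count column per gender, so idxmax returns the column label of the larger count. A hypothetical stand-in illustrates the shape:

# Hypothetical stand-in table; the real counts live in names-usa.csv.xz
demo_table = pd.DataFrame(
    {'female': [255, 4014], 'male': [9551, 25]},
    index=['jay', 'maria'])
demo_table.loc['jay'].idxmax()
# 'male'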
from collections import defaultdict

def get_category_with_nltk(tagged_sentences):
    'Pick the category that wins the most votes from keyword phrases'
    parse = nltk.RegexpParser('WHEE: {<JJ.*>*<NN.*>+}').parse
    vote_by_category = defaultdict(int)
    vote_weight = 1  # Count votes less if the term appears later
    for sentence in tagged_sentences:
        chunk_tree = parse(sentence)
        for tree in chunk_tree.subtrees():
            if tree.label() != 'WHEE':
                continue
            terms = [x[0].lower() for x in tree.leaves()]
            if not has_keyword(terms):
                continue
            for x in categories:
                if x in terms:
                    vote_by_category[x] += vote_weight
        vote_weight *= 0.9
    if not vote_by_category:
        return
    return pd.Series(vote_by_category).idxmax()

def has_keyword(terms):
    for x in terms:
        if x.startswith(keyword_prefix):
            return True
    return False
get_category_with_nltk(tag_part_of_speech(
    'labor trafficker'))
# Expect 'labor' if 'labor' appears in the categories file
from collections import Counter

def extract_information_with_nltk(row):
    'Extract the most likely place, name, gender and category'
    tagged_sentences = tag_part_of_speech(row.body_text)
    places, names = [], []
    for sentence in tagged_sentences:
        chunk_tree = nltk.ne_chunk(sentence)
        for tree in chunk_tree.subtrees():
            label = tree.label()
            x = ' '.join(x[0] for x in tree.leaves())
            x = x.lower().replace('.', '').strip()
            if len(x) < 2:
                continue  # Skip single characters
            if label == 'GPE':
                places.append(x)
            elif label == 'PERSON':
                names.append(x)
    if places:
        count_by_place = combine_matches(Counter(places))
        place = pd.Series(count_by_place).idxmax()
    else:
        place = None
    if names:
        count_by_name = combine_matches(Counter(names))
        name = pd.Series(count_by_name).idxmax()
    else:
        name = None
    return pd.Series({
        'place': place,
        'name': name,
        'gender': get_gender(name),
        'category': get_category_with_nltk(tagged_sentences),
    })
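The function can be tried on a single made-up row; the exact output depends on NLTK's pretrained models, which may or may not tag these tokens as expected:

extract_information_with_nltk(pd.Series({'body_text': (
    'John Smith was sentenced in Houston for labor trafficking.')}))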
target_path = target_folder + '/incidents.csv'
extracted_t = selected_t.apply(
    extract_information_with_nltk, axis=1)
extracted_t.to_csv(target_path, index=False)
print('incident_table_path = %s' % target_path)
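To spot-check the saved table:

pd.read_csv(target_path).head()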