Build a Human Trafficking Dataset from Court Cases and News Articles 20171214

Pay Notebook Creator: Roy Hyunjin Han
Set Container: Numerical CPU with TINY Memory for 10 Minutes

Extract a Table of Human Trafficking Incidents from a Table of USA DOJ Court Case Press Releases using NLTK

We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking. Thanks to Aida Shoydokova for writing much of the original code.

  • This code may take a long time to run. We recommend reserving an execution time of at least 10 minutes, depending on the size of your dataset.
In [ ]:
# CrossCompute
# Input path to the table of USA DOJ press releases
# (original value was truncated in export — TODO confirm actual path)
usa_doj_press_release_table_path = 'usa-doj-press-releases.csv'
# Terms starting with this prefix mark a document as relevant
keyword_prefix = 'traffick'
# Text file listing one category keyword per line
category_text_path = 'human-trafficking-categories.txt'
# Folder where the output incident table is written
target_folder = '/tmp'
In [ ]:
    import nltk
except ImportError:
    import pip
    pip.main(['install', 'nltk'])
    import nltk'punkt')'averaged_perceptron_tagger')'maxent_ne_chunker')'words')
In [ ]:
import pandas as pd

# Load the press release table; later cells read the columns
# topic_names, title and body — verify the CSV provides them
t = pd.read_csv(usa_doj_press_release_table_path)
print('document_count = %s' % len(t))
In [ ]:
# Load the category keywords, one per line, normalized to lowercase
# (original line was truncated in export — reconstruction)
with open(category_text_path) as category_file:
    categories = [x.strip().lower() for x in category_file if x.strip()]
In [ ]:
import re

# Matches 'human trafficking', 'Human  Traffickers', etc.
PATTERN_KEYWORD = re.compile(r'human\s+traffick', re.IGNORECASE)

def has_right_topic(topic_names):
    """Return True if any ';'-separated topic mentions human trafficking.

    topic_names may be NaN/None (press release without topic tags),
    in which case the answer is False.
    """
    if pd.isnull(topic_names):
        return False
    for topic_name in topic_names.split(';'):
        # Reconstructed condition: the export dropped this line
        if PATTERN_KEYWORD.search(topic_name):
            return True
    return False

# Flag each press release whose topic tags mention human trafficking
t['has_right_topic'] = t['topic_names'].map(has_right_topic)
In [ ]:
# Count the number of documents with the right topic
# (original print was truncated in export — reconstruction)
print('matching_topic_document_count = %s' % t[
    'has_right_topic'].sum())
In [ ]:
def has_empty_topic_but_right_text(row):
    """Return True if the row has no topic tags but its title or body
    mentions human trafficking.

    Rows that DO have topic tags are handled by has_right_topic and
    are excluded here to avoid double counting.
    """
    topic_names = row['topic_names']
    if not pd.isnull(topic_names):
        return False
    title = row['title']
    # Reconstructed conditions: the export truncated these two lines
    if not pd.isnull(title) and PATTERN_KEYWORD.search(title):
        return True
    body = row['body']
    if not pd.isnull(body) and PATTERN_KEYWORD.search(body):
        return True
    return False
# Flag untagged press releases whose title or body mentions human trafficking
t['has_empty_topic_but_right_text'] = t.apply(
    has_empty_topic_but_right_text, axis=1)
In [ ]:
# Count the number of documents with empty topic but right text
# (original print was truncated in export — reconstruction)
print('empty_topic_matching_text_document_count = %s' % t[
    'has_empty_topic_but_right_text'].sum())
In [ ]:
# Keep every press release flagged by either detector; copy so later
# column assignments do not touch the original table
keep_mask = t.has_right_topic | t.has_empty_topic_but_right_text
selected_t = t[keep_mask].copy()
print('human_trafficking_document_count = %s' % len(selected_t))

Normalize Text

In [ ]:
import re
from bs4 import BeautifulSoup

# Collapses runs of blank lines left over after tag stripping
NEWLINE_PATTERN = re.compile(r'\n+', re.MULTILINE)

def get_body_text(x):
    """Return plain text for x, which is raw HTML or a parsed soup.

    Tags are stripped with newline separators, then consecutive
    newlines are collapsed to one.
    """
    soup = x if hasattr(x, 'getText') else BeautifulSoup(x, 'lxml')
    stripped = soup.getText(separator=u'\n').strip()
    return NEWLINE_PATTERN.sub('\n', stripped)

# Convert each HTML body into normalized plain text
selected_t['body_text'] = selected_t['body'].map(get_body_text)

Extract Information

In [ ]:
def tag_part_of_speech(text):
    """Split text into sentences and tag each token with its POS.

    Returns a list of sentences, each a list of (token, tag) pairs.
    """
    tagged_sentences = []
    for sentence in nltk.sent_tokenize(text):
        tokens = nltk.word_tokenize(sentence)
        tagged_sentences.append(nltk.pos_tag(tokens))
    return tagged_sentences

tag_part_of_speech('Here I am.')
In [ ]:
from itertools import combinations

def combine_matches(count_by_x):
    """Combine first or last name counts with full name counts.

    If one key is a substring of another (e.g. 'benjamin' inside
    'benjamin b wagner'), fold the shorter key's count into the longer
    key and zero the shorter one. Returns a new dict; the input is not
    modified.
    """
    d = dict(count_by_x)
    for x1, x2 in combinations(list(d.keys()), 2):
        if x1 in x2:
            abridged_x, full_x = x1, x2
        elif x2 in x1:
            abridged_x, full_x = x2, x1
        else:
            # BUG FIX: the original fell through with abridged_x/full_x
            # unset (NameError) or stale from a previous pair
            continue
        d[full_x] += d[abridged_x]
        d[abridged_x] = 0
    return d

    'benjamin b wagner': 1,
    'benjamin': 3,
    'anthony w ishii': 2,
In [ ]:
# Counts of USA given names, indexed by lowercase name; the columns are
# presumably gender labels — TODO confirm against names-usa.csv.xz
name_count_table = pd.read_csv(
    'names-usa.csv.xz', compression='xz', index_col=0)

def get_gender(name):
    """Guess the gender label for a full name from USA name counts.

    Looks up the lowercased first word of the name and returns the
    column with the highest count, or None if the name is empty or not
    in the table. (Control-flow lines were truncated in export —
    reconstruction.)
    """
    if not name:
        return None
    given_name = name.split()[0].lower()
    try:
        selected_table = name_count_table.loc[given_name]
    except KeyError:
        return None
    return selected_table.idxmax()

get_gender('jay leno')
In [ ]:
from collections import defaultdict

def get_category_with_nltk(tagged_sentences):
    """Vote for the best-matching category across POS-tagged sentences.

    Chunks each sentence into adjective+noun phrases, keeps only chunks
    containing a keyword-prefixed term, and lets each category match
    cast a vote. Earlier matches weigh more. Returns the winning
    category, or None when nothing matches.
    (The original 'continue'/'return None' lines were truncated in
    export — reconstruction.)
    """
    parse = nltk.RegexpParser('WHEE: {<JJ.*>*<NN.*>+}').parse
    vote_by_category = defaultdict(int)
    vote_weight = 1  # Count votes less if the term appears later
    for sentence in tagged_sentences:
        chunk_tree = parse(sentence)
        for tree in chunk_tree.subtrees():
            if tree.label() != 'WHEE':
                continue
            terms = [x[0].lower() for x in tree.leaves()]
            if not has_keyword(terms):
                continue
            # NOTE(review): terms is a list of single words, so a
            # multi-word category can never match here — confirm intent
            for x in categories:
                if x in terms:
                    vote_by_category[x] += vote_weight
                    vote_weight *= 0.9
    if not vote_by_category:
        return None
    return pd.Series(vote_by_category).idxmax()

def has_keyword(terms):
    """Return True if any term starts with the configured keyword_prefix."""
    return any(term.startswith(keyword_prefix) for term in terms)

    'labor trafficker'))
In [ ]:
from collections import Counter

def extract_information_with_nltk(row):
    """Extract place, name, gender and category from one press release.

    Runs NER over the POS-tagged body text, collects GPE chunks as
    places and PERSON chunks as names, merges partial-name counts, and
    picks the most frequent of each. Returns a Series with keys
    place, name, gender, category.
    (Several branch lines were truncated in export — reconstruction.)
    """
    tagged_sentences = tag_part_of_speech(row.body_text)
    places, names = [], []
    for sentence in tagged_sentences:
        chunk_tree = nltk.ne_chunk(sentence)
        for tree in chunk_tree.subtrees():
            label = tree.label()
            x = ' '.join(x[0] for x in tree.leaves())
            x = x.lower().replace('.', '').strip()
            if len(x) < 2:
                continue  # Skip single characters
            if label == 'GPE':
                places.append(x)
            elif label == 'PERSON':
                names.append(x)
    if places:
        count_by_place = combine_matches(Counter(places))
        place = pd.Series(count_by_place).idxmax()
    else:
        place = None
    if names:
        count_by_name = combine_matches(Counter(names))
        name = pd.Series(count_by_name).idxmax()
    else:
        name = None
    return pd.Series({
        'place': place,
        'name': name,
        'gender': get_gender(name),
        'category': get_category_with_nltk(tagged_sentences),
    })
In [ ]:
# Run the extractor over every selected press release and save the
# resulting incident table
target_path = target_folder + '/incidents.csv'
extracted_t = selected_t.apply(
    extract_information_with_nltk, axis=1)
# PEP 8: no spaces around '=' in keyword arguments
extracted_t.to_csv(target_path, index=False)
print('incident_table_path = %s' % target_path)