Build a Human Trafficking Dataset from Court Cases and News Articles 20171214

Pay Notebook Creator: Roy Hyunjin Han0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0

Extract a Table of Human Trafficking Incidents from a Table of USA DOJ Court Case Press Releases using spaCy

We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking. Thanks to Aida Shoydokova for writing much of the original code.

  • The environment level must be set to COMPUTATIONAL in order for this code to run.
  • The memory level should be set to LARGE or higher to use spaCy's higher-accuracy models.
  • This code may take a long time to run. We recommend reserving an execution time of at least 10 minutes, depending on the size of your dataset.
In [ ]:
# CrossCompute
usa_doj_press_release_table_path = (
keyword_prefix = 'traffick'
category_text_path = 'human-trafficking-categories.txt'
target_folder = '/tmp'
In [ ]:
from os import environ

environment_level = int(environ.get(
memory_level = int(environ.get(

if environment_level < 1:
        'environment_level.error = environment level must be set to '
        'computational in order to use the spacy package because it '
        'takes too long to install')

if memory_level < 3:
        'memory_level.error = memory level should be set to large '
        'or higher in order to use the en_core_web_lg spacy model')
In [ ]:
import spacy

# Pick the spaCy model that fits the available memory: the small default
# model when memory is limited, otherwise the large, more accurate model.
# NOTE(review): the 'en' shortcut name presumably requires spaCy v2 --
# confirm against the installed spaCy version.
if memory_level < 3:
    spacy_model = 'en'
else:
    spacy_model = 'en_core_web_lg'
print('spacy_model = %s' % spacy_model)
nlp = spacy.load(spacy_model)
In [ ]:
import pandas as pd

# Load the press release table and report how many rows it contains
t = pd.read_csv(usa_doj_press_release_table_path)
print('document_count = %s' % len(t.index))
In [ ]:
categories = [x.lower() for x in open(
In [ ]:
import re

PATTERN_KEYWORD = re.compile(r'human\s+traffick', re.IGNORECASE)


def has_right_topic(topic_names):
    """Return True if any listed topic mentions human trafficking.

    topic_names is a semicolon-separated string of topic names, or a null
    value (None / NaN) when the document has no topics. Each topic is
    checked against PATTERN_KEYWORD, so a document only counts when one
    of its topics actually matches the keyword.
    """
    if pd.isnull(topic_names):
        return False
    return any(
        PATTERN_KEYWORD.search(topic_name)
        for topic_name in topic_names.split(';'))

# Apply has_right_topic to every value of the topic_names column and
# store the result as a new column
t['has_right_topic'] = t['topic_names'].map(has_right_topic)
In [ ]:
# Count the number of documents with the right topic
print('matching_topic_document_count = %s' % t[
In [ ]:
def has_empty_topic_but_right_text(row):
    """Return True if the row has no topics but its text matches the keyword.

    row is a mapping or table row with 'topic_names', 'title' and 'body'.
    Rows that already have topic names return False here. Otherwise the
    title and then the body are searched with PATTERN_KEYWORD; null
    values are skipped.
    """
    if not pd.isnull(row['topic_names']):
        return False
    for column_name in ('title', 'body'):
        text = row[column_name]
        if not pd.isnull(text) and PATTERN_KEYWORD.search(text):
            return True
    return False
# Apply has_empty_topic_but_right_text to each row (axis=1) and store
# the result as a new column
t['has_empty_topic_but_right_text'] = t.apply(
    has_empty_topic_but_right_text, axis=1)
In [ ]:
# Count the number of documents with empty topic but right text
print('empty_topic_matching_text_document_count = %s' % t[
In [ ]:
# Keep an independent copy of the rows flagged by either check, then
# report how many human trafficking court cases were found
selected_t = t.loc[
    t['has_right_topic'] | t['has_empty_topic_but_right_text']
].copy()
print('human_trafficking_document_count = %s' % selected_t.shape[0])

Normalize Text

In [ ]:
import re
from bs4 import BeautifulSoup

NEWLINE_PATTERN = re.compile(r'\n+', re.MULTILINE)


def get_body_text(x):
    """Return the tag-free text of x with runs of newlines collapsed.

    x is either markup to parse with BeautifulSoup (lxml parser) or an
    object that already provides getText.
    """
    soup = x if hasattr(x, 'getText') else BeautifulSoup(x, 'lxml')
    # Pull out the text between tags, one fragment per line, trimmed
    stripped_text = soup.getText(separator=u'\n').strip()
    # Squeeze consecutive newlines down to one
    return NEWLINE_PATTERN.sub('\n', stripped_text)

# Apply get_body_text to every value of the body column and store the
# result as a new column
selected_t['body_text'] = selected_t['body'].map(get_body_text)

Extract Information

In [ ]:
from itertools import combinations

def combine_matches(count_by_x):
    """Combine first or last name counts with full name counts.

    count_by_x maps each string to its count. Whenever one key is a
    substring of another, the shorter key's count is added to the longer
    key and the shorter key's count is reset to zero. Keys that are not
    related to each other are left untouched. A new dict is returned;
    the input is not modified.
    """
    d = dict(count_by_x)
    # combinations() snapshots the keys up front, so updating values
    # while looping is safe
    for x1, x2 in combinations(d, 2):
        if x1 in x2:
            abridged_x, full_x = x1, x2
        elif x2 in x1:
            abridged_x, full_x = x2, x1
        else:
            # Unrelated pair: nothing to merge
            continue
        d[full_x] += d[abridged_x]
        d[abridged_x] = 0
    return d

    'benjamin b wagner': 1,
    'benjamin': 3,
    'anthony w ishii': 2,
In [ ]:
# Load name counts from an xz-compressed CSV, using the first column as
# the index.
# NOTE(review): get_gender looks rows up by lowercase given name and
# returns the label with the largest value, so presumably rows are given
# names and columns are genders -- verify against names-usa.csv.xz.
name_count_table = pd.read_csv(
    'names-usa.csv.xz', compression='xz', index_col=0)

def get_gender(name):
    """Return the most frequent label for the name's first word, or None.

    The first whitespace-separated word of name is lowercased and looked
    up in name_count_table; the label with the largest count in that row
    is returned. None is returned when name is empty, None, made only of
    whitespace, or when the given name is not in the table.
    """
    name_parts = name.split() if name else []
    if not name_parts:
        return None
    given_name = name_parts[0].lower()
    try:
        selected_table = name_count_table.loc[given_name]
    except KeyError:
        # Unknown given name
        return None
    return selected_table.idxmax()

get_gender('jay leno')
In [ ]:
from collections import defaultdict

def get_category_with_spacy(spacy_doc):
    """Return the category with the highest vote for the document, or None.

    Only noun chunks whose root word starts with keyword_prefix are
    considered. Each category found among the words of such a chunk gets
    a vote, and the vote weight shrinks by 10% after every vote so that
    earlier mentions count more. Chunk words are lowercased before the
    comparison, matching the lowercase handling of the root word.
    Returns None when no category received a vote.
    """
    vote_by_category = defaultdict(int)
    vote_weight = 1  # Count votes less if the term appears later
    for chunk in spacy_doc.noun_chunks:
        # Ignore chunks whose root does not include our keyword
        if not chunk.root.text.lower().startswith(keyword_prefix):
            continue
        chunk_words = chunk.text.lower().split()
        for x in categories:
            if x in chunk_words:
                vote_by_category[x] += vote_weight
                vote_weight *= 0.9
    if not vote_by_category:
        return None
    return pd.Series(vote_by_category).idxmax()
In [ ]:
from collections import Counter

def extract_information_with_spacy(row):
    """Extract the main place, person, gender and category from one row.

    Runs the spaCy pipeline on row.body_text, collects the GPE and PERSON
    entities (lowercased, periods removed, single characters skipped),
    merges partial mentions into full ones with combine_matches and keeps
    the most frequent place and name. Returns a pandas Series with the
    keys 'place', 'name', 'gender' and 'category'; place and name are
    None when no matching entity was found.
    """
    spacy_doc = nlp(row.body_text)
    places, names = [], []
    for entity in spacy_doc.ents:
        label = entity.label_
        x = entity.text.lower().replace('.', '').strip()
        if len(x) < 2:
            continue  # Skip single characters
        if label == 'GPE':
            places.append(x)
        elif label == 'PERSON':
            names.append(x)
    if places:
        count_by_place = combine_matches(Counter(places))
        place = pd.Series(count_by_place).idxmax()
    else:
        place = None
    if names:
        count_by_name = combine_matches(Counter(names))
        name = pd.Series(count_by_name).idxmax()
    else:
        name = None
    # NOTE(review): the visible source ends after the 'category' entry;
    # confirm that no further columns were intended here.
    return pd.Series({
        'place': place,
        'name': name,
        'gender': get_gender(name),
        'category': get_category_with_spacy(spacy_doc),
    })
In [ ]:
# Extract one incident per selected press release, save the table
# without its index and report where it was written
target_path = '%s/incidents.csv' % target_folder
extracted_t = selected_t.apply(extract_information_with_spacy, axis=1)
extracted_t.to_csv(target_path, index=False)
print('incident_table_path = %s' % target_path)