Build a Human Trafficking Dataset from Court Cases and News Articles 20171214




Pay Notebook Creator: Roy Hyunjin Han0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0

Explore USA Department of Justice Court Case Press Releases

We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking. Thanks to Aida Shoydokova for writing much of the original code.

{ a_date : Start Date ? Specify the date of the earliest desired press release }

{ b_date : End Date ? Specify the date of the latest desired press release }

Please be sure to select a Memory Level of at least MEDIUM.

In [ ]:
# CrossCompute
a_date = '2000-01-01'  # Start date: earliest desired press release
b_date = '2020-01-01'  # End date: latest desired press release
target_folder = '/tmp'  # Folder where the tables and images below are saved
In [ ]:
from os import environ

# Read the memory level for this run from the environment.
# NOTE(review): the variable is presumably set by the CrossCompute platform;
# when it is missing or malformed (for example, when the notebook runs
# elsewhere), fall back to 0 so that the notebook uses the sample dataset
# instead of crashing on int(None).
try:
    memory_level = int(environ.get('CROSSCOMPUTE_MEMORY_LEVEL', 0))
except ValueError:
    memory_level = 0
# Levels below 2 are reported as insufficient for the entire dataset
if memory_level < 2:
    print(
        'memory_level.error = memory level should be set to MEDIUM '
        'or higher in order to analyze the entire dataset')

Load Dataset

In [ ]:
%matplotlib inline
import pandas as pd

# Choose the complete dataset when the memory level allows it;
# otherwise use the file whose name marks it as a sample
if memory_level >= 2:
    url = (
        'https://www.dropbox.com/s/zr1tem2w4w1ocjz/'
        'human-trafficking-usa-doj-20171111-1730.csv.xz?dl=1')
else:
    url = (
        'https://www.dropbox.com/s/nx1b6afuze8ibfg/'
        'human-trafficking-usa-doj-20171111-1730-'
        'sample-30.csv.xz?dl=1')

# Load the xz-compressed CSV and report the number of rows
raw_t = pd.read_csv(url, compression='xz')
document_count = raw_t.shape[0]
print('raw_document_count = %s' % document_count)
In [ ]:
import importlib
import subprocess
import sys

# Install arrow only when it is missing.
# pip.main() is not a supported API (it was removed from the public pip
# module in pip 10), so run pip as a subprocess of the current interpreter.
try:
    import arrow
except ImportError:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'arrow'])
    # Let the import system notice the package that was just installed
    importlib.invalidate_caches()
    import arrow
In [ ]:
def _get_epoch_seconds(date_text):
    """Return the epoch time in whole seconds for a date string.

    arrow < 1.0 exposes `timestamp` as a property, while arrow >= 1.0 exposes
    it as a method. Support both so that the result is always a number that
    can be compared with the published_time column.
    """
    stamp = arrow.get(date_text).timestamp
    return int(stamp() if callable(stamp) else stamp)


# Convert the start and end dates into epoch times for filtering
a_time = _get_epoch_seconds(a_date)
b_time = _get_epoch_seconds(b_date)

Examine a Row

In [ ]:
# Examine a court case press release by showing
# every column of the first row in the raw table
raw_t.iloc[0]
In [ ]:
# Get the body of the first court case press release
# and print only its first 500 characters
print(raw_t.iloc[0]['body'][:500])

Get Timeframe

In [ ]:
# Get the publication time of the press release in the row labelled 0;
# the cells below treat this value as epoch time
raw_t.published_time[0]
In [ ]:
from datetime import datetime

# Convert epoch time into a date
# NOTE: fromtimestamp without a tz argument uses the local time zone
# of the machine that runs the notebook
datetime.fromtimestamp(raw_t.published_time[0]).date()
In [ ]:
# Define a function to convert epoch time into a date
def get_date(x):
    """Return the UTC calendar date for epoch time `x`.

    `x` is the number of seconds since 1970-01-01 00:00:00 UTC. The time zone
    is pinned to UTC so that the result does not depend on the local time
    zone of the machine that runs the notebook.

    Raises whatever `datetime.fromtimestamp` raises for values that it cannot
    convert, such as NaN.
    """
    from datetime import timezone
    return datetime.fromtimestamp(x, tz=timezone.utc).date()

# Try the function on the publication time in the row labelled 0
get_date(raw_t.published_time[0])
In [ ]:
# Get the date of the earliest publication time in the raw (unfiltered) table
get_date(raw_t.published_time.min())
In [ ]:
# Get the date of the latest publication time in the raw (unfiltered) table
get_date(raw_t.published_time.max())

Filter Rows by Time

In [ ]:
# Keep the rows published between the start and end times (both inclusive);
# copy the selection so that later column assignments do not touch raw_t
t = raw_t.loc[
    (a_time <= raw_t['published_time']) &
    (raw_t['published_time'] <= b_time)
].copy()
print('filtered_document_count = %s' % len(t))

Identify Missing Data

In [ ]:
# Get number of filtered documents without topics (topic_names is null)
len(t[pd.isnull(t['topic_names'])])
In [ ]:
# Get number of filtered documents with topics (topic_names is not null);
# keep the count in a variable because the labelled fraction is computed from it
labelled_document_count = len(t[~pd.isnull(t['topic_names'])])
labelled_document_count
In [ ]:
# Get number of documents with topics using dropna
# (an alternative to the boolean mask; it counts the same rows)
len(t['topic_names'].dropna())
In [ ]:
# Compute the fraction (0 to 1) of filtered documents that have topics.
# labelled_document_count is counted from the filtered table t, so divide by
# the size of t; document_count is the size of the unfiltered table and would
# understate the fraction whenever the date filter removes rows.
# Use 0 when the filter leaves no documents, to avoid dividing by zero.
labelled_document_fraction = (
    labelled_document_count / len(t) if len(t) else 0)
labelled_document_fraction
In [ ]:
# Format the raw (unfiltered) document count with thousands separators
'{:,} documents'.format(document_count)
In [ ]:
# Format the labelled fraction as a whole-number percentage
'{:.0f}% labelled'.format(100 * labelled_document_fraction)
In [ ]:
# Count the number of documents with a body
# (variant 1: bracket column access, then drop null values)
len(t['body'].dropna())
In [ ]:
# Count the number of documents with a body
# (variant 2: attribute column access, then drop null values)
len(t.body.dropna())
In [ ]:
# Count the number of documents with a body
# (variant 3: count() tallies only the values that are not null)
t.body.count()
In [ ]:
# Count the number of documents without a body
# (summing the boolean null mask counts the True values)
t.body.isnull().sum()
In [ ]:
# Count the number of documents without a title
t.title.isnull().sum()
In [ ]:
# Count the number of documents without a published time
t.published_time.isnull().sum()
In [ ]:
# Count the number of documents without topic_names
# (the same quantity as the null-mask count computed earlier)
t.topic_names.isnull().sum()
In [ ]:
def _get_percent(values):
    """Return the percent of entries in the column `values` that are not null.

    The denominator is the length of the column itself, so a column taken
    from the filtered table is measured against the filtered document count
    rather than the unfiltered one. Return 0 for an empty column to avoid
    dividing by zero.
    """
    row_count = len(values)
    return 100 * values.count() / row_count if row_count else 0


# Report how complete each field is among the filtered documents
print('has_body_percent = {:.0f}%'.format(
    _get_percent(t.body)))
print('has_title_percent = {:.0f}%'.format(
    _get_percent(t.title)))
print('has_published_time_percent = {:.0f}%'.format(
    _get_percent(t.published_time)))
print('has_topic_percent = {:.0f}%'.format(
    _get_percent(t.topic_names)))

Study Distributions

In [ ]:
import numpy as np

def count_topics(topic_names):
    """Return the number of semicolon-separated topics in `topic_names`.

    Missing values (None or NaN) count as zero topics. Any other value is
    converted to text and split on ';', so a value without a semicolon counts
    as one topic. The function always returns an int; previously a numeric
    value that was not NaN fell through and returned None.
    """
    if topic_names is None:
        return 0
    try:
        if np.isnan(topic_names):
            return 0
    except TypeError:
        # np.isnan rejects strings, so the value is not a numeric NaN
        pass
    return len(str(topic_names).split(';'))
        
# Create a column called topic_count by applying count_topics
# to the topic_names value of every row
t['topic_count'] = t['topic_names'].map(count_topics)
In [ ]:
# Count the number of documents where topic_count is zero
# (summing the boolean mask counts the True values)
(t.topic_count == 0).sum()
In [ ]:
# Get the distribution of the number of topics per document:
# how many documents have each topic_count value, most frequent first
t['topic_count'].value_counts(sort=True)
In [ ]:
# Examine the first row where the topic count is more than 1
t[t.topic_count > 1].iloc[0]
In [ ]:
from collections import defaultdict

# Tally how many documents mention each topic:
# split every non-null topic_names value on ';' and strip the whitespace
# around each name before counting it
d = defaultdict(int)
for topic_names in t['topic_names'].dropna():
    for topic_name in map(str.strip, topic_names.split(';')):
        d[topic_name] += 1

# Order the topics from most to least frequent
count_by_topic = pd.Series(d).sort_values(ascending=False)
count_by_topic
In [ ]:
# Save the topic counts as a CSV file in the target folder
# and print the path under the count_by_topic_table_path key
target_path = target_folder + '/count-by-topic.csv'
count_by_topic.to_csv(target_path)
print('count_by_topic_table_path = %s' % target_path)
In [ ]:
import matplotlib.pyplot as plt

# Plot the distribution of topics by frequency as a bar chart
count_by_topic.plot(kind='bar')

# Save the current figure as a PNG file in the target folder
# and print the path under the count_by_topic_image_path key
target_path = target_folder + '/count-by-topic.png'
plt.savefig(target_path)
print('count_by_topic_image_path = %s' % target_path)
In [ ]:
# Count number of documents with a published_time
# (variant 1: count() tallies only the values that are not null)
t.published_time.count()
In [ ]:
# Count number of documents with a published_time
# (variant 2: drop the rows whose published_time is null, then count rows)
len(t.dropna(subset=['published_time']))
In [ ]:
# Get table of documents with a published_time
v = t.dropna(subset=['published_time']).copy()
# Convert each epoch time into a date with get_date, then into a pandas
# datetime, and use the result as the index so that resample can group by time
v['published_datetime'] = pd.to_datetime(
    v['published_time'].map(get_date))
v.set_index('published_datetime', inplace=True)

# Plot the frequency of empty topics by year
empty_topic_table = v[v.topic_count == 0]
# Count the uuid values in each annual bin
# NOTE(review): newer pandas releases deprecate the 'A' alias in favor of
# 'YE' -- confirm against the installed pandas version before changing it
empty_topic_table_by_year = empty_topic_table.resample('A')[
    'uuid'].count()
empty_topic_table_by_year.plot()

# Save the current figure as a PNG file in the target folder
# and print the path under the empty_topic_count_by_year_line_image_path key
target_path = target_folder + '/empty-topic-count-by-year-line.png'
plt.savefig(target_path)
print('empty_topic_count_by_year_line_image_path = %s' % target_path)
In [ ]:
# Plot the same yearly counts of documents without topics as a bar chart
empty_topic_table_by_year.plot(kind='bar')

# Save the current figure as a PNG file in the target folder
# and print the path under the empty_topic_count_by_year_bar_image_path key
target_path = target_folder + '/empty-topic-count-by-year-bar.png'
plt.savefig(target_path)
print('empty_topic_count_by_year_bar_image_path = %s' % target_path)