We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking. Thanks to Aida Shoydokova for writing much of the original code.
{ a_date : Start Date ? Specify the date of the earliest desired press release }
{ b_date : End Date ? Specify the date of the latest desired press release }
Please be sure to select a Memory Level of at least MEDIUM.
# CrossCompute
a_date = '2000-01-01'
b_date = '2020-01-01'
target_folder = '/tmp'
from os import environ
# environ.get returns None when CROSSCOMPUTE_MEMORY_LEVEL is unset, and
# int(None) raises TypeError; fall back to 0 (lowest level) so the script
# degrades to the warning below instead of crashing outside CrossCompute.
memory_level = int(environ.get('CROSSCOMPUTE_MEMORY_LEVEL') or 0)
if memory_level < 2:
    print(
        'memory_level.error = memory level should be set to MEDIUM '
        'or higher in order to analyze the entire dataset')
%matplotlib inline
import pandas as pd
if memory_level < 2:
url = (
'https://www.dropbox.com/s/nx1b6afuze8ibfg/'
'human-trafficking-usa-doj-20171111-1730-'
'sample-30.csv.xz?dl=1')
else:
url = (
'https://www.dropbox.com/s/zr1tem2w4w1ocjz/'
'human-trafficking-usa-doj-20171111-1730.csv.xz?dl=1')
raw_t = pd.read_csv(url, compression='xz')
document_count = len(raw_t)
print('raw_document_count = %s' % document_count)
# The original code ran ``pip.main(['install', 'arrow'])`` at import time —
# an internal API removed in pip 10 — just to parse two ISO dates. It then
# read ``arrow.get(...).timestamp``, which arrow 1.0 changed from a property
# to a method (the bare attribute access now yields a bound method, not an
# int). The standard library computes the same UTC-midnight epoch directly.
from datetime import datetime, timezone

def _to_utc_timestamp(date_text):
    """Return UTC epoch seconds for a ``YYYY-MM-DD`` date string."""
    parsed = datetime.strptime(date_text, '%Y-%m-%d')
    return int(parsed.replace(tzinfo=timezone.utc).timestamp())

a_time = _to_utc_timestamp(a_date)
b_time = _to_utc_timestamp(b_date)
# Examine a court case press release
# NOTE: bare expressions like the ones below only render output inside a
# Jupyter/IPython notebook; when run as a plain script they are no-ops.
raw_t.iloc[0]
# Get the body of the first court case
# Examine the first 500 characters
print(raw_t.iloc[0]['body'][:500])
# Get the first publication time of the court case press release
raw_t.published_time[0]
from datetime import datetime
# Convert epoch time into a date
# (fromtimestamp interprets the epoch value in the runner's local timezone)
datetime.fromtimestamp(raw_t.published_time[0]).date()
# Define a function to convert epoch time into a date
def get_date(x):
    """Return the calendar date (local timezone) for epoch seconds ``x``."""
    moment = datetime.fromtimestamp(x)
    return moment.date()
# Sanity-check the helper on the first document's publication time
get_date(raw_t.published_time[0])
# Get earliest date
get_date(raw_t.published_time.min())
# Get latest date
get_date(raw_t.published_time.max())
# Keep only the press releases published inside the requested date window;
# .copy() avoids SettingWithCopy warnings when columns are added later.
t = raw_t[(
    raw_t.published_time >= a_time
) & (
    raw_t.published_time <= b_time)].copy()
print('filtered_document_count = %s' % len(t))
# Get number of documents without topics
len(t[pd.isnull(t['topic_names'])])
# Get number of documents with topics
labelled_document_count = len(t[~pd.isnull(t['topic_names'])])
labelled_document_count
# Get number of documents with topics using dropna
len(t['topic_names'].dropna())
# Compute percentage of documents that have topics
# NOTE(review): the denominator is the unfiltered document_count, not len(t) —
# confirm whether the fraction should be over the filtered table instead.
labelled_document_fraction = labelled_document_count / document_count
labelled_document_fraction
'{:,} documents'.format(document_count)
'{:.0f}% labelled'.format(100 * labelled_document_fraction)
# Count the number of documents with a body
len(t['body'].dropna())
# Count the number of documents with a body (attribute-style column access)
len(t.body.dropna())
# Count the number of documents with a body (Series.count skips NaN by design)
t.body.count()
# Count the number of documents without a body
t.body.isnull().sum()
# Count the number of records without a title
t.title.isnull().sum()
# Count the number of cases without a published time
t.published_time.isnull().sum()
# Count the number of documents without topic_names
t.topic_names.isnull().sum()
# NOTE(review): these percentages also divide by the unfiltered
# document_count rather than len(t) — confirm that is intentional.
print('has_body_percent = {:.0f}%'.format(
    100 * t.body.count() / document_count))
print('has_title_percent = {:.0f}%'.format(
    100 * t.title.count() / document_count))
print('has_published_time_percent = {:.0f}%'.format(
    100 * t.published_time.count() / document_count))
print('has_topic_percent = {:.0f}%'.format(
    100 * t.topic_names.count() / document_count))
import numpy as np

def count_topics(topic_names):
    """Return how many ';'-separated topics a document lists.

    Missing markers (NaN from pandas, None) count as zero topics.

    The original implementation used ``np.isnan`` inside try/except for
    control flow: it silently returned None for any non-NaN, non-string
    value (falling off the end of the function) and raised AttributeError
    for None. An explicit isinstance check closes both holes.
    """
    if isinstance(topic_names, str):
        return len(topic_names.split(';'))
    # NaN / None / any other non-string value means "no topics"
    return 0
# Create a column called topic_count by applying count_topics to each row
t['topic_count'] = t['topic_names'].map(count_topics)
# Count the number of documents where topic_count is zero
(t.topic_count == 0).sum()
# Get the distribution of the number of topics per document
t['topic_count'].value_counts(sort=True)
# Examine a row where the topic count is more than 1
t[t.topic_count > 1].iloc[0]
from collections import Counter
# Tally how many documents mention each topic; topic_names is a
# ';'-separated string, so split each labelled document and trim whitespace.
topic_tally = Counter(
    raw_name.strip()
    for joined_names in t['topic_names'].dropna()
    for raw_name in joined_names.split(';'))
# Most frequent topics first
count_by_topic = pd.Series(topic_tally).sort_values(ascending=False)
count_by_topic
target_path = target_folder + '/count-by-topic.csv'
count_by_topic.to_csv(target_path)
print('count_by_topic_table_path = %s' % target_path)
import matplotlib.pyplot as plt
# Plot the distribution of topics by frequency
# (pandas draws onto the current matplotlib figure, which savefig captures)
count_by_topic.plot(kind='bar')
target_path = target_folder + '/count-by-topic.png'
plt.savefig(target_path)
print('count_by_topic_image_path = %s' % target_path)
# Count number of documents with a published_time
t.published_time.count()
# Count number of documents with a published_time (equivalent via dropna)
len(t.dropna(subset=['published_time']))
# Get table of documents with a published_time
v = t.dropna(subset=['published_time']).copy()
# Index by publication date so the table can be resampled by year
v['published_datetime'] = pd.to_datetime(
    v['published_time'].map(get_date))
v.set_index('published_datetime', inplace=True)
# Plot the frequency of empty topics by year
empty_topic_table = v[v.topic_count == 0]
# NOTE(review): 'A' is the legacy annual-frequency alias; pandas >= 2.2
# deprecates it in favor of 'YE' — confirm the pinned pandas version.
empty_topic_table_by_year = empty_topic_table.resample('A')[
    'uuid'].count()
empty_topic_table_by_year.plot()
target_path = target_folder + '/empty-topic-count-by-year-line.png'
plt.savefig(target_path)
print('empty_topic_count_by_year_line_image_path = %s' % target_path)
# NOTE(review): outside a notebook this bar chart draws onto the same axes
# as the line plot above — call plt.figure() first if run as a plain script.
empty_topic_table_by_year.plot(kind='bar')
target_path = target_folder + '/empty-topic-count-by-year-bar.png'
plt.savefig(target_path)
print('empty_topic_count_by_year_bar_image_path = %s' % target_path)