We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking. Thanks to Aida Shoydokova for writing much of the original code.
{ a_date : Start Date ? Specify the date of the earliest desired press release }
{ b_date : End Date ? Specify the date of the latest desired press release }
Please be sure to select a Memory Level of at least MEDIUM.
# CrossCompute
a_date = '2000-01-01'
b_date = '2020-01-01'
target_folder = '/tmp'
from os import environ
# environ.get returns None when CROSSCOMPUTE_MEMORY_LEVEL is unset, and
# int(None) raises TypeError; fall back to 0 (lowest level) so the script
# degrades to the warning below instead of crashing outside CrossCompute.
memory_level = int(environ.get('CROSSCOMPUTE_MEMORY_LEVEL') or 0)
if memory_level < 2:
    print(
        'memory_level.error = memory level should be set to MEDIUM '
        'or higher in order to analyze the entire dataset')
%matplotlib inline
import pandas as pd
if memory_level < 2:
url = (
'https://www.dropbox.com/s/nx1b6afuze8ibfg/'
'human-trafficking-usa-doj-20171111-1730-'
'sample-30.csv.xz?dl=1')
else:
url = (
'https://www.dropbox.com/s/zr1tem2w4w1ocjz/'
'human-trafficking-usa-doj-20171111-1730.csv.xz?dl=1')
raw_t = pd.read_csv(url, compression='xz')
document_count = len(raw_t)
print('raw_document_count = %s' % document_count)
# The original code ran ``pip.main(['install', 'arrow'])`` at import time —
# an internal API removed in pip 10 — just to parse two ISO dates. It then
# read ``arrow.get(...).timestamp``, which arrow 1.0 changed from a property
# to a method (the bare attribute access now yields a bound method, not an
# int). The standard library computes the same UTC-midnight epoch directly.
from datetime import datetime, timezone

def _to_utc_timestamp(date_text):
    """Return UTC epoch seconds for a ``YYYY-MM-DD`` date string."""
    parsed = datetime.strptime(date_text, '%Y-%m-%d')
    return int(parsed.replace(tzinfo=timezone.utc).timestamp())

a_time = _to_utc_timestamp(a_date)
b_time = _to_utc_timestamp(b_date)
# Examine a court case press release
# NOTE: bare expressions like the ones below only render output inside a
# Jupyter/IPython notebook; when run as a plain script they are no-ops.
raw_t.iloc[0]
# Get the body of the first court case
# Examine the first 500 characters
print(raw_t.iloc[0]['body'][:500])
# Get the first publication time of the court case press release
raw_t.published_time[0]
from datetime import datetime
# Convert epoch time into a date
# (fromtimestamp interprets the epoch value in the runner's local timezone)
datetime.fromtimestamp(raw_t.published_time[0]).date()
# Define a function to convert epoch time into a date
def get_date(x):
    """Return the calendar date (local timezone) for epoch seconds ``x``."""
    moment = datetime.fromtimestamp(x)
    return moment.date()
# Sanity-check the helper on the first document's publication time
get_date(raw_t.published_time[0])
# Get earliest date
get_date(raw_t.published_time.min())
# Get latest date
get_date(raw_t.published_time.max())
# Keep only the press releases published inside the requested date window;
# .copy() avoids SettingWithCopy warnings when columns are added later.
t = raw_t[(
    raw_t.published_time >= a_time
) & (
    raw_t.published_time <= b_time)].copy()
print('filtered_document_count = %s' % len(t))
# Get number of documents without topics
len(t[pd.isnull(t['topic_names'])])
# Get number of documents with topics
labelled_document_count = len(t[~pd.isnull(t['topic_names'])])
labelled_document_count
# Get number of documents with topics using dropna
len(t['topic_names'].dropna())
# Compute percentage of documents that have topics
# NOTE(review): the denominator is the unfiltered document_count, not len(t) —
# confirm whether the fraction should be over the filtered table instead.
labelled_document_fraction = labelled_document_count / document_count
labelled_document_fraction
'{:,} documents'.format(document_count)
'{:.0f}% labelled'.format(100 * labelled_document_fraction)
# Count the number of documents with a body
len(t['body'].dropna())
# Count the number of documents with a body (attribute-style column access)
len(t.body.dropna())
# Count the number of documents with a body (Series.count skips NaN by design)
t.body.count()
# Count the number of documents without a body
t.body.isnull().sum()
# Count the number of records without a title
t.title.isnull().sum()
# Count the number of cases without a published time
t.published_time.isnull().sum()
# Count the number of documents without topic_names
t.topic_names.isnull().sum()
# NOTE(review): these percentages also divide by the unfiltered
# document_count rather than len(t) — confirm that is intentional.
print('has_body_percent = {:.0f}%'.format(
    100 * t.body.count() / document_count))
print('has_title_percent = {:.0f}%'.format(
    100 * t.title.count() / document_count))
print('has_published_time_percent = {:.0f}%'.format(
    100 * t.published_time.count() / document_count))
print('has_topic_percent = {:.0f}%'.format(
    100 * t.topic_names.count() / document_count))
import numpy as np

def count_topics(topic_names):
    """Return how many ';'-separated topics a document lists.

    Missing markers (NaN from pandas, None) count as zero topics.

    The original implementation used ``np.isnan`` inside try/except for
    control flow: it silently returned None for any non-NaN, non-string
    value (falling off the end of the function) and raised AttributeError
    for None. An explicit isinstance check closes both holes.
    """
    if isinstance(topic_names, str):
        return len(topic_names.split(';'))
    # NaN / None / any other non-string value means "no topics"
    return 0
# Create a column called topic_count by applying count_topics to each row
t['topic_count'] = t['topic_names'].map(count_topics)
# Count the number of documents where topic_count is zero
(t.topic_count == 0).sum()
# Get the distribution of the number of topics per document
t['topic_count'].value_counts(sort=True)
# Examine a row where the topic count is more than 1
t[t.topic_count > 1].iloc[0]
from collections import Counter
# Tally how many documents mention each topic; topic_names is a
# ';'-separated string, so split each labelled document and trim whitespace.
topic_tally = Counter(
    raw_name.strip()
    for joined_names in t['topic_names'].dropna()
    for raw_name in joined_names.split(';'))
# Most frequent topics first
count_by_topic = pd.Series(topic_tally).sort_values(ascending=False)
count_by_topic
target_path = target_folder + '/count-by-topic.csv'
count_by_topic.to_csv(target_path)
print('count_by_topic_table_path = %s' % target_path)
import matplotlib.pyplot as plt
# Plot the distribution of topics by frequency
# (pandas draws onto the current matplotlib figure, which savefig captures)
count_by_topic.plot(kind='bar')
target_path = target_folder + '/count-by-topic.png'
plt.savefig(target_path)
print('count_by_topic_image_path = %s' % target_path)
# Count number of documents with a published_time
t.published_time.count()
# Count number of documents with a published_time (equivalent via dropna)
len(t.dropna(subset=['published_time']))
# Get table of documents with a published_time
v = t.dropna(subset=['published_time']).copy()
# Index by publication date so the table can be resampled by year
v['published_datetime'] = pd.to_datetime(
    v['published_time'].map(get_date))
v.set_index('published_datetime', inplace=True)
# Plot the frequency of empty topics by year
empty_topic_table = v[v.topic_count == 0]
# NOTE(review): 'A' is the legacy annual-frequency alias; pandas >= 2.2
# deprecates it in favor of 'YE' — confirm the pinned pandas version.
empty_topic_table_by_year = empty_topic_table.resample('A')[
    'uuid'].count()
empty_topic_table_by_year.plot()
target_path = target_folder + '/empty-topic-count-by-year-line.png'
plt.savefig(target_path)
print('empty_topic_count_by_year_line_image_path = %s' % target_path)
# NOTE(review): outside a notebook this bar chart draws onto the same axes
# as the line plot above — call plt.figure() first if run as a plain script.
empty_topic_table_by_year.plot(kind='bar')
target_path = target_folder + '/empty-topic-count-by-year-bar.png'
plt.savefig(target_path)
print('empty_topic_count_by_year_bar_image_path = %s' % target_path)