Build a Human Trafficking Dataset from Court Cases and News Articles 20171214





Normalize Raw Text

We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking. The notebook tokenizes a source text and then normalizes the tokens in three ways: Porter stemming, Snowball stemming and WordNet lemmatization.

In [ ]:
# CrossCompute
source_text_path = 'Zbigniew Herbert - Mr Cogito Tells of the Temptation of Spinoza.txt'
target_folder = '/tmp'
In [ ]:
try:
    import nltk
except ImportError:
    # Install nltk if it is missing (pip.main was removed in newer pip releases)
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'nltk'])
    import nltk
# Download the tokenizer and WordNet data used below
nltk.download('punkt')
nltk.download('wordnet')
In [ ]:
from nltk import word_tokenize

# Read the source text and split it into word tokens
with open(source_text_path) as source_file:
    tokens = word_tokenize(source_file.read())
tokens[:5]
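As a quick sanity check (not in the original notebook), you can count how many tokens the tokenizer produced and how many distinct ones there are; the sketch below only reuses the tokens list defined above.

In [ ]:
# Illustrative sanity check: total and distinct token counts
token_count = len(tokens)
vocabulary_size = len(set(token.lower() for token in tokens))
print('token_count = %s' % token_count)
print('vocabulary_size = %s' % vocabulary_size)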
In [ ]:
from os.path import join
In [ ]:
from nltk import PorterStemmer

# Reduce each token to its stem with the Porter algorithm
normalization_model = PorterStemmer()
terms = [normalization_model.stem(x) for x in tokens]
target_path = join(target_folder, 'tokens-stemmed-porter.txt')
with open(target_path, 'wt') as target_file:
    target_file.write(' '.join(terms))
print('porter_stemmed_text_path = %s' % target_path)
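To see what the stemmer actually changed, here is a minimal sketch (not part of the original notebook) that reuses the tokens list and the Porter normalization_model from the cell above.

In [ ]:
# Illustrative sketch: list a few tokens that the Porter stemmer changed
changed_pairs = sorted(set(
    (x, normalization_model.stem(x)) for x in tokens
    if normalization_model.stem(x) != x.lower()))
changed_pairs[:10]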
In [ ]:
from nltk import SnowballStemmer

# Reduce each token to its stem with the Snowball (Porter2) algorithm
normalization_model = SnowballStemmer('english')
terms = [normalization_model.stem(x) for x in tokens]
target_path = join(target_folder, 'tokens-stemmed-snowball.txt')
with open(target_path, 'wt') as target_file:
    target_file.write(' '.join(terms))
print('snowball_stemmed_text_path = %s' % target_path)
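The Porter and Snowball stemmers mostly agree on English text; the following sketch (not part of the original notebook) lists a few tokens on which their stems differ.

In [ ]:
# Illustrative sketch: tokens where the Porter and Snowball stems differ
from nltk import PorterStemmer, SnowballStemmer
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer('english')
differing_pairs = sorted(set(
    (x, porter_stemmer.stem(x), snowball_stemmer.stem(x)) for x in tokens
    if porter_stemmer.stem(x) != snowball_stemmer.stem(x)))
differing_pairs[:10]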
In [ ]:
from nltk import WordNetLemmatizer

# Map each token to its WordNet lemma (dictionary form)
normalization_model = WordNetLemmatizer()
terms = [normalization_model.lemmatize(x) for x in tokens]
target_path = join(target_folder, 'tokens-lemma-wordnet.txt')
with open(target_path, 'wt') as target_file:
    target_file.write(' '.join(terms))
print('wordnet_lemmatized_text_path = %s' % target_path)
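Note that WordNetLemmatizer.lemmatize treats every word as a noun unless you pass a part-of-speech tag, so inflected verbs pass through unchanged by default; the sketch below (not part of the original notebook) shows the effect of the pos argument.

In [ ]:
# Illustrative sketch: lemmatization depends on the part-of-speech tag
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('running'))           # stays 'running' when treated as a noun
print(lemmatizer.lemmatize('running', pos='v'))  # becomes 'run' when treated as a verb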