We presented this tool and notebook as part of our workshop on Computational Approaches to Fight Human Trafficking. The notebook tokenizes a source text with NLTK, then writes three normalized copies of it: one stemmed with the Porter algorithm, one stemmed with the Snowball algorithm, and one lemmatized against WordNet.
# CrossCompute
# The variables in this first cell are the tool's input arguments;
# CrossCompute reads the `name = value` lines printed at the end of
# each step below as the tool's outputs.
source_text_path = 'Zbigniew Herbert - Mr Cogito Tells of the Temptation of Spinoza.txt'
target_folder = '/tmp'
try:
    import nltk
except ImportError:
    # pip.main() was removed in pip 10; call pip through the interpreter instead
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'nltk'])
    import nltk
nltk.download('punkt')    # Punkt models used by word_tokenize
nltk.download('wordnet')  # WordNet data used by the lemmatizer
# Newer NLTK releases may also need nltk.download('omw-1.4') for WordNet
from nltk import word_tokenize

# Split the source text into word and punctuation tokens
with open(source_text_path) as source_file:
    tokens = word_tokenize(source_file.read())
tokens[:5]
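word_tokenize wraps NLTK's recommended pipeline (Punkt sentence splitting followed by Treebank word tokenization), so punctuation and contractions become separate tokens. A minimal sketch of that behavior on a made-up sentence, with expected output shown in comments (exact results can vary slightly across NLTK versions):

from nltk import word_tokenize

word_tokenize("Mr Cogito doesn't hesitate.")
# ['Mr', 'Cogito', 'does', "n't", 'hesitate', '.']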
from os.path import join
from nltk import PorterStemmer

# Reduce each token to its Porter stem and save the result
normalization_model = PorterStemmer()
terms = [normalization_model.stem(x) for x in tokens]
target_path = join(target_folder, 'tokens-stemmed-porter.txt')
with open(target_path, 'wt') as target_file:
    target_file.write(' '.join(terms))
print('porter_stemmed_text_path = %s' % target_path)
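The Porter stemmer strips suffixes by rule rather than by dictionary lookup, so its stems are often not real words. A quick sanity check on a few sample tokens, with expected stems in comments:

from nltk import PorterStemmer

stem = PorterStemmer().stem
stem('running')    # 'run'
stem('ponies')     # 'poni'
stem('happiness')  # 'happi'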
from nltk import SnowballStemmer

# Stem each token with the Snowball (Porter2) algorithm and save the result
normalization_model = SnowballStemmer('english')
terms = [normalization_model.stem(x) for x in tokens]
target_path = join(target_folder, 'tokens-stemmed-snowball.txt')
with open(target_path, 'wt') as target_file:
    target_file.write(' '.join(terms))
print('snowball_stemmed_text_path = %s' % target_path)
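The Snowball English stemmer (also called Porter2) is Martin Porter's own revision of the original algorithm and is generally considered a modest improvement. NLTK's documentation illustrates the difference with 'generously':

from nltk import PorterStemmer, SnowballStemmer

PorterStemmer().stem('generously')             # 'gener'
SnowballStemmer('english').stem('generously')  # 'generous'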
from nltk import WordNetLemmatizer

# Look each token up in WordNet and save the lemmatized result
normalization_model = WordNetLemmatizer()
terms = [normalization_model.lemmatize(x) for x in tokens]
target_path = join(target_folder, 'tokens-lemma-wordnet.txt')
with open(target_path, 'wt') as target_file:
    target_file.write(' '.join(terms))
print('wordnet_lemmatized_text_path = %s' % target_path)
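Unlike the stemmers, the lemmatizer looks each token up in WordNet and returns a dictionary word, but it treats every token as a noun unless you pass a part-of-speech tag; since this notebook uses the default, verbs and adjectives pass through largely unchanged. A sketch of that behavior, with expected output in comments:

from nltk import WordNetLemmatizer

lemmatize = WordNetLemmatizer().lemmatize
lemmatize('temptations')       # 'temptation'
lemmatize('running')           # 'running' (treated as a noun)
lemmatize('running', pos='v')  # 'run'
lemmatize('better', pos='a')   # 'good'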