NLP




Pay Notebook Creator: Salah Ahmed0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0

word frequency

In [46]:
# CrossCompute
words_text_path = 'wheniheard.txt'
target_folder = '.'
In [4]:
with open(words_text_path) as f:
    text = f.read().strip()
In [5]:
text
Out[5]:
"When I heard the learn'd astronomer;\t \nWhen the proofs, the figures, were ranged in columns before me;\t \nWhen I was shown the charts and the diagrams, to add, divide,  and measure them;\t \nWhen I, sitting, heard the astronomer, where he lectured with  much applause in the lecture-room,\t \nHow soon, unaccountable, I became tired and sick;\nTill rising and gliding out, I wander'd off by myself,\t \nIn the mystical moist night-air, and from time to time,\t \nLook'd up in perfect silence at the stars."
In [30]:
import matplotlib.pyplot as plt
from os.path import join
from pandas import DataFrame
from string import punctuation
try:
    import nltk
except ImportError:
    import sys
    !{sys.executable} -m pip install nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [31]:
sw = stopwords.words('english') + list(punctuation)
sw
Out[31]:
['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 'not',
 'only',
 'own',
 'same',
 'so',
 'than',
 'too',
 'very',
 's',
 't',
 'can',
 'will',
 'just',
 'don',
 'should',
 'now',
 'd',
 'll',
 'm',
 'o',
 're',
 've',
 'y',
 'ain',
 'aren',
 'couldn',
 'didn',
 'doesn',
 'hadn',
 'hasn',
 'haven',
 'isn',
 'ma',
 'mightn',
 'mustn',
 'needn',
 'shan',
 'shouldn',
 'wasn',
 'weren',
 'won',
 'wouldn',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']
In [34]:
tokens = filter(lambda x: x not in sw, word_tokenize(text))
In [35]:
freq = nltk.FreqDist(tokens)
In [36]:
freq.most_common()
Out[36]:
[('I', 5),
 ('When', 4),
 ("'d", 3),
 ('heard', 2),
 ('astronomer', 2),
 ('time', 2),
 ('learn', 1),
 ('proofs', 1),
 ('figures', 1),
 ('ranged', 1),
 ('columns', 1),
 ('shown', 1),
 ('charts', 1),
 ('diagrams', 1),
 ('add', 1),
 ('divide', 1),
 ('measure', 1),
 ('sitting', 1),
 ('lectured', 1),
 ('much', 1),
 ('applause', 1),
 ('lecture-room', 1),
 ('How', 1),
 ('soon', 1),
 ('unaccountable', 1),
 ('became', 1),
 ('tired', 1),
 ('sick', 1),
 ('Till', 1),
 ('rising', 1),
 ('gliding', 1),
 ('wander', 1),
 ('In', 1),
 ('mystical', 1),
 ('moist', 1),
 ('night-air', 1),
 ('Look', 1),
 ('perfect', 1),
 ('silence', 1),
 ('stars', 1)]
In [37]:
%matplotlib inline
In [38]:
fig = plt.figure()
freq.plot(30)
In [39]:
plot_path = join(target_folder, 'plot.png')
fig.savefig(plot_path)
print('wordcount_image_path = %s' % plot_path)
wordcount_image_path = ./plot.png
In [45]:
df = DataFrame(freq.most_common(), columns=['word', 'count'])
df = df.set_index('word')
df.head()
Out[45]:
<style> .dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; } </style>
count
word
I 5
When 4
'd 3
heard 2
astronomer 2
In [44]:
table_path = join(target_folder, 'word_count.csv')
df.to_csv(table_path)
print('wordcount_table_path = %s' % table_path)
word_count_table_path = ./word_count.csv