## Natural Language Processing with NLTK

In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk

## Tokenization

1. Tokenize by word
2. Tokenize by sentence

In [3]:
# Fetch the 'punkt' data package into the local nltk_data directory.
# NOTE(review): recent NLTK releases may look up 'punkt_tab' instead of 'punkt';
# confirm against the installed version if the tokenizers raise LookupError.
nltk.download('punkt')

# Import the two tokenizers used in this section:
# sent_tokenize splits text into sentences, word_tokenize into word tokens.
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# sample string to test tokenize
example_string = """
    Muad'Dib learned rapidly because his first training was in how to learn.
    And the first lesson of all was the basic trust that he could learn.
    It's shocking to find how many people do not believe they can learn,
    and how many more believe learning to be difficult."""

In [5]:
# tokenize by word
word_tokenize(example_string)

["Muad'Dib",
 'learned',
 'rapidly',
 'because',
 'his',
 'first',
 'training',
 'was',
 'in',
 'how',
 'to',
 'learn',
 '.',
 'And',
 'the',
 'first',
 'lesson',
 'of',
 'all',
 'was',
 'the',
 'basic',
 'trust',
 'that',
 'he',
 'could',
 'learn',
 '.',
 'It',
 "'s",
 'shocking',
 'to',
 'find',
 'how',
 'many',
 'people',
 'do',
 'not',
 'believe',
 'they',
 'can',
 'learn',
 ',',
 'and',
 'how',
 'many',
 'more',
 'believe',
 'learning',
 'to',
 'be',
 'difficult',
 '.']

In [6]:
sent_tokenize(example_string)

["\n    Muad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn,\n    and how many more believe learning to be difficult."]

## Filtering Stop Words

In [7]:
# download stopwords, then import
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# sample string
worf_quote = "Sir, I protest. I am not a merry man!"

In [9]:
# tokenize by word
words_in_quote = word_tokenize(worf_quote)
words_in_quote

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [10]:
# set stop words in english
stop_words = set(stopwords.words("english"))

# create an empty list to store words that are not stop words (result)
filtered_list = []

In [11]:
# Keep only the tokens that are not English stop words. casefold() makes the
# membership test case-insensitive, so capitalised tokens such as "I" are
# filtered as well.
# The list is rebuilt from scratch here instead of appended to, which keeps the
# cell idempotent: running it twice cannot duplicate tokens in filtered_list.
filtered_list = [word for word in words_in_quote if word.casefold() not in stop_words]

# display results
filtered_list

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

## Stemming

In [12]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [13]:
# create a stemmer
stemmer = PorterStemmer()

# sample string
string_for_stemming = """
    The crew of the USS Discovery discovered many discoveries.
    Discovering is what explorers do."""

# tokenize by word - without stemmer
words = word_tokenize(string_for_stemming)
words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discoveries',
 '.',
 'Discovering',
 'is',
 'what',
 'explorers',
 'do',
 '.']

In [14]:
# stemming using PorterStemmer
stemmed_words = [stemmer.stem(word) for word in words]
stemmed_words

['the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

## Tagging Parts of Speech

In [15]:
# sample string
sagan_quote = """
    If you wish to make an apple pie from scratch,
    you must first invent the universe."""

# tokenize by word
words_in_sagan_quote = word_tokenize(sagan_quote)
words_in_sagan_quote

['If',
 'you',
 'wish',
 'to',
 'make',
 'an',
 'apple',
 'pie',
 'from',
 'scratch',
 ',',
 'you',
 'must',
 'first',
 'invent',
 'the',
 'universe',
 '.']

In [16]:
# download and install tagger
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [17]:
# begin tagging
nltk.pos_tag(words_in_sagan_quote)

[('If', 'IN'),
 ('you', 'PRP'),
 ('wish', 'VBP'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('an', 'DT'),
 ('apple', 'NN'),
 ('pie', 'NN'),
 ('from', 'IN'),
 ('scratch', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('must', 'MD'),
 ('first', 'VB'),
 ('invent', 'VB'),
 ('the', 'DT'),
 ('universe', 'NN'),
 ('.', '.')]

In [18]:
# get descriptions of tags
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [19]:
# some gibberish: made-up words, to see which tags the tagger assigns to them
jabberwocky_excerpt = """
    'Twas brillig, and the slithy toves did gyre and gimble in the wabe:
    all mimsy were the borogoves, and the mome raths outgrabe."""

# tokenize by word
words_in_excerpt = word_tokenize(jabberwocky_excerpt)

# Tag each token with its part of speech. Only the last expression of a cell is
# displayed, so the tagged list is the single output of this cell.
nltk.pos_tag(words_in_excerpt)

[("'T", 'NN'),
 ('was', 'VBD'),
 ('brillig', 'VBN'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('slithy', 'JJ'),
 ('toves', 'NNS'),
 ('did', 'VBD'),
 ('gyre', 'NN'),
 ('and', 'CC'),
 ('gimble', 'JJ'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('wabe', 'NN'),
 (':', ':'),
 ('all', 'DT'),
 ('mimsy', 'NNS'),
 ('were', 'VBD'),
 ('the', 'DT'),
 ('borogoves', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('mome', 'JJ'),
 ('raths', 'NNS'),
 ('outgrabe', 'RB'),
 ('.', '.')]

## Lemmatizing

In [20]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
from nltk.stem import WordNetLemmatizer

# create a lemmatizer
lemmatizer = WordNetLemmatizer()

# test
lemmatizer.lemmatize("scarves")

'scarf'

In [22]:
words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discoveries',
 '.',
 'Discovering',
 'is',
 'what',
 'explorers',
 'do',
 '.']

In [23]:
# lemmatizing instead of stemming
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
lemmatized_words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discovery',
 '.',
 'Discovering',
 'is',
 'what',
 'explorer',
 'do',
 '.']

In [24]:
# sample string
string_for_lemmatizing = "The friend of DeSoto loves scarves."

# Tokenize by word. A dedicated name is used so the `words` list built in the
# stemming section is not overwritten; otherwise re-running the earlier cells
# that read `words` would silently operate on this sentence instead.
words_for_lemmatizing = word_tokenize(string_for_lemmatizing)

# Lemmatize every token; as the last expression, this list is what the cell displays.
lemmatized_words = [lemmatizer.lemmatize(word) for word in words_for_lemmatizing]
lemmatized_words

['The', 'friend', 'of', 'DeSoto', 'love', 'scarf', '.']

In [25]:
# But what would happen if you lemmatized a word that looked very different from its lemma? Try lemmatizing "worst":
# bad -> worse -> worst
lemmatizer.lemmatize("worst")

'worst'

In [26]:
# You got the result 'worst' because lemmatizer.lemmatize() assumed that "worst" was a noun. You can make it clear that you want "worst" to be an adjective:
lemmatizer.lemmatize("worst", pos="a")

'bad'

In [27]:
lemmatizer.lemmatize("best", pos="n")

'best'

## Chunking

In [28]:
# sample string
lotr_quote = "It's a dangerous business, Frodo, going out your door."

# tokenize by word
words_in_lotr_quote = word_tokenize(lotr_quote)
words_in_lotr_quote

['It',
 "'s",
 'a',
 'dangerous',
 'business',
 ',',
 'Frodo',
 ',',
 'going',
 'out',
 'your',
 'door',
 '.']

In [29]:
# tag word by part of speech
nltk.download("averaged_perceptron_tagger")
lotr_pos_tags = nltk.pos_tag(words_in_lotr_quote)
lotr_pos_tags

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('a', 'DT'),
 ('dangerous', 'JJ'),
 ('business', 'NN'),
 (',', ','),
 ('Frodo', 'NNP'),
 (',', ','),
 ('going', 'VBG'),
 ('out', 'RP'),
 ('your', 'PRP$'),
 ('door', 'NN'),
 ('.', '.')]

In [30]:
# Create a chunk grammar with one regular expression rule
grammar = "NP: {<DT>?<JJ>*<NN>}"

NP stands for noun phrase.

According to the rule you created, your chunks:

1. Start with an optional (`?`) determiner (`<DT>`)
2. Can have any number (`*`) of adjectives (`<JJ>`)
3. End with a noun (`<NN>`)

In [31]:
# Create a chunk parser from the grammar string assigned to `grammar`
chunk_parser = nltk.RegexpParser(grammar)

# Parse the POS-tagged quote; the result is the tree of chunks the parser built
tree = chunk_parser.parse(lotr_pos_tags)

# Show a visual representation of this tree.
# NOTE(review): no inline output is recorded for this cell, so draw() presumably
# opens a separate GUI window - confirm it does not stall Restart & Run All.
tree.draw()

## Chinking

In [32]:
lotr_pos_tags

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('a', 'DT'),
 ('dangerous', 'JJ'),
 ('business', 'NN'),
 (',', ','),
 ('Frodo', 'NNP'),
 (',', ','),
 ('going', 'VBG'),
 ('out', 'RP'),
 ('your', 'PRP$'),
 ('door', 'NN'),
 ('.', '.')]

In [33]:
grammar = """
    Chunk: {<.*>+}
           }<JJ>{"""

The first rule of your grammar is `{<.*>+}`. This rule has curly braces that face inward (`{}`) because it’s used to determine what patterns you want to include in your chunks. In this case, you want to include everything: `<.*>+`.

The second rule of your grammar is `}<JJ>{`. This rule has curly braces that face outward (`}{`) because it’s used to determine what patterns you want to exclude from your chunks. In this case, you want to exclude adjectives: `<JJ>`.

In [34]:
# Create a chunk parser with the chinking grammar.
# Note: this rebinds `chunk_parser` (and `tree` below), replacing the objects
# created in the Chunking section.
chunk_parser = nltk.RegexpParser(grammar)

# Now chunk your sentence with the chink you specified
tree = chunk_parser.parse(lotr_pos_tags)

# Draw the tree.
# NOTE(review): no inline output is recorded for this cell, so draw() presumably
# opens a separate GUI window - confirm it does not stall Restart & Run All.
tree.draw()

## Named Entity Recognition (NER)

In [35]:
# sample string
quote = """
    Men like Schiaparelli watched the red planet—it is odd, by-the-bye, that
    for countless centuries Mars has been the star of war—but failed to
    interpret the fluctuating appearances of the markings they mapped so well.
    All that time the Martians must have been getting ready.

    During the opposition of 1894 a great light was seen on the illuminated
    part of the disk, first at the Lick Observatory, then by Perrotin of Nice,
    and then by other observers. English readers heard of it first in the
    issue of Nature dated August 2."""

In [36]:
# nltk.ne_chunk() is used further down to recognize named entities.
# This cell only downloads the two data packages for this section:
# "maxent_ne_chunker" and "words".
nltk.download("maxent_ne_chunker")
nltk.download("words")

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [37]:
# Now create a function to extract named entities
def extract_ne(quote):
    """Return the set of named entities found in ``quote``.

    The text is word-tokenized, part-of-speech tagged and handed to
    ``nltk.ne_chunk`` with ``binary=True``. Every direct child of the
    resulting tree that carries the label ``"NE"`` contributes one string:
    the first element of each of its items, joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(quote))
    chunked = nltk.ne_chunk(tagged_tokens, binary=True)

    entities = set()
    for node in chunked:
        # Skip children that expose no label() method.
        if not hasattr(node, "label"):
            continue
        if node.label() == "NE":
            entities.add(" ".join(leaf[0] for leaf in node))
    return entities

In [38]:
# Take a look at the information you extracted
extract_ne(quote)

{'Lick Observatory', 'Mars', 'Nature', 'Perrotin', 'Schiaparelli'}

You missed the city of Nice, possibly because NLTK interpreted it as a regular English adjective, but you still got the following:

1. An institution: 'Lick Observatory'
2. A planet: 'Mars'
3. A publication: 'Nature'
4. People: 'Perrotin', 'Schiaparelli'

## Get Text to Analyze

In [39]:
# In order to analyze texts in NLTK, you first need to import them. This requires nltk.download("book"), which is a pretty big download:
nltk.download("book")
# The star import binds the sample texts and sentences announced by the loader
# banner (text1..text9, sent1..sent9) directly into the notebook namespace;
# text8 is the one used in the cells that follow.
from nltk.book import *

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     C:\Users\Asmaliza\AppData\Roaming\nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     C:\Users\

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


## Using the Concordance

In [40]:
# The personals corpus is called text8, so we’re going to call .concordance() on it with the parameter "man":
text8.concordance("man")

Displaying 14 of 14 matches:
 to hearing from you all . ABLE young man seeks , sexy older women . Phone for 
ble relationship . GENUINE ATTRACTIVE MAN 40 y . o ., no ties , secure , 5 ft .
ship , and quality times . VIETNAMESE MAN Single , never married , financially 
ip . WELL DRESSED emotionally healthy man 37 like to meet full figured woman fo
 nth subs LIKE TO BE MISTRESS of YOUR MAN like to be treated well . Bold DTE no
eeks lady in similar position MARRIED MAN 50 , attrac . fit , seeks lady 40 - 5
eks nice girl 25 - 30 serious rship . Man 46 attractive fit , assertive , and k
 40 - 50 sought by Aussie mid 40s b / man f / ship r / ship LOVE to meet widowe
discreet times . Sth E Subs . MARRIED MAN 42yo 6ft , fit , seeks Lady for discr
woman , seeks professional , employed man , with interests in theatre , dining 
 tall and of large build seeks a good man . I am a nonsmoker , social drinker ,
lead to relationship . SEEKING HONEST MAN I am 41 y . o ., 5 ft . 4 , med . bui
 quiet time

In [41]:
# Let’s see if there’s a similar pattern with the word "woman":
text8.concordance("woman")

Displaying 11 of 11 matches:
at home . Seeking an honest , caring woman , slim or med . build , who enjoys t
thy man 37 like to meet full figured woman for relationship . 48 slim , shy , S
rry . MALE 58 years old . Is there a Woman who would like to spend 1 weekend a 
 other interests . Seeking Christian Woman for fship , view to rship . SWM 45 D
ALE 60 - burly beared seeks intimate woman for outings n / s s / d F / ston / P
ington . SCORPIO 47 seeks passionate woman for discreet intimate encounters SEX
le dad . 42 , East sub . 5 " 9 seeks woman 30 + for f / ship relationship TALL 
personal trainer looking for married woman age open for fun MARRIED Dark guy 37
rinker , seeking slim - medium build woman who is happy in life , age open . AC
. O . TERTIARY Educated professional woman , seeks professional , employed man 
 real romantic , age 50 - 65 y . o . WOMAN OF SUBSTANCE 56 , 59 kg ., 50 , fit 
