# Word Similarity Notes

## Stemming with NLTK

In [1]:
import nltk

In [2]:
# Import only the class that is used; a wildcard import hides where names come from
# and can silently shadow other names in the notebook namespace.
from nltk.stem.porter import PorterStemmer

# One stemmer instance, reused by every stemming cell below.
stemmer = PorterStemmer()

In [4]:
# The ending "-ation" is removed, leaving the stem 'argument'.
stemmer.stem("argumentation")

'argument'

In [5]:
# Unlike "argumentation", "creation" comes back unchanged.
stemmer.stem("creation")

'creation'

In [6]:
# Stems need not be dictionary words: "fly" becomes 'fli'.
stemmer.stem("fly")

'fli'

In [7]:
# "flies" yields the same stem as "fly" ('fli'), so the two forms are conflated.
stemmer.stem("flies")

'fli'

In [8]:
# The noun "laziness" is reduced to 'lazi'.
stemmer.stem("laziness")

'lazi'

In [9]:
# The adjective "lazy" gets the same stem as "laziness" ('lazi').
stemmer.stem("lazy")

'lazi'

In [10]:
# "traditional" is cut down to 'tradit', which is not an English word.
stemmer.stem("traditional")

'tradit'

In [11]:
# First of three related forms (create / creation / creating): "create" -> 'creat'.
stemmer.stem("create")

'creat'

In [12]:
# Repeated from earlier for side-by-side comparison: "creation" stays 'creation',
# so it does NOT share the 'creat' stem produced for "create" and "creating".
stemmer.stem("creation")

'creation'

In [13]:
# "creating" maps to 'creat', the same stem as "create".
stemmer.stem("creating")

'creat'

In [14]:
# A long derived form: "decisiveness" is shortened all the way to 'decis'.
stemmer.stem("decisiveness")

'decis'

In [15]:
# "public" is returned unchanged; compare with "publicize" in the next cell.
stemmer.stem("public")

'public'

In [16]:
# "publicize" collapses to 'public', so the verb and the adjective/noun become indistinguishable.
stemmer.stem("publicize")

'public'

In [17]:
# The stemmer also rewrites made-up words: "xyzing" -> 'xyze'.
stemmer.stem("xyzing")

'xyze'

## Exploring WordNet: Synsets, Similarity, and Lemmatizing

In [18]:
from nltk.corpus import wordnet as wn

In [222]:
# Every synset (word sense) WordNet lists for "dog": the output shows seven noun senses and one verb sense.
wn.synsets("dog")

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [212]:
# Select a single sense by its identifier; 'dog.n.01' is the first entry in the list above.
d = wn.synset("dog.n.01")

In [213]:
# Dictionary-style gloss for the selected sense.
d.definition()

'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds'

In [214]:
# Word forms grouped under this sense; the output lists three (dog, domestic_dog, Canis_familiaris).
d.lemmas()

[Lemma('dog.n.01.dog'),
 Lemma('dog.n.01.domestic_dog'),
 Lemma('dog.n.01.Canis_familiaris')]

In [243]:
# The next cell asks a lemma (not the synset) for its antonyms, so take the first lemma of 'able.a.01'.
# NOTE(review): `d` held the dog synset in earlier cells and is rebound here to a different sense.
d = wn.synset('able.a.01')
d1 = d.lemmas()[0]

In [244]:
# Antonym lookup on the lemma; the result is a list of lemmas as well (here the lemma of 'unable.a.01').
d1.antonyms()

[Lemma('unable.a.01.unable')]

In [256]:
# Two noun senses that are compared in the cells below.
dog = wn.synset("dog.n.01")
cool = wn.synset("cool.n.01")

In [257]:
# Similarity score based on the path between the two senses in the hypernym graph.
# Presumably 1.0 would mean identical senses (confirm against the NLTK docs); ~0.077 here means they are far apart.
dog.path_similarity(cool)

0.07692307692307693

In [258]:
# Direct parents only (one step up the hierarchy); the output shows dog.n.01 has two.
dog.hypernyms()

[Synset('canine.n.02'), Synset('domestic_animal.n.01')]

In [259]:
# Follow hypernym links transitively to collect every ancestor of dog.n.01, ending at 'entity.n.01'.
list(dog.closure(lambda synset: synset.hypernyms()))

[Synset('canine.n.02'),
 Synset('domestic_animal.n.01'),
 Synset('carnivore.n.01'),
 Synset('animal.n.01'),
 Synset('placental.n.01'),
 Synset('organism.n.01'),
 Synset('mammal.n.01'),
 Synset('living_thing.n.01'),
 Synset('vertebrate.n.01'),
 Synset('whole.n.02'),
 Synset('chordate.n.01'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

In [260]:
# Same transitive walk for cool.n.01. Comparing with the dog.n.01 ancestors above, the only
# shared entries are 'physical_entity.n.01' and 'entity.n.01'.
list(cool.closure(lambda synset: synset.hypernyms()))

[Synset('coldness.n.03'),
 Synset('temperature.n.01'),
 Synset('vasoconstrictor.n.01'),
 Synset('fundamental_quantity.n.01'),
 Synset('physical_property.n.01'),
 Synset('agent.n.01'),
 Synset('measure.n.02'),
 Synset('property.n.02'),
 Synset('causal_agent.n.01'),
 Synset('abstraction.n.06'),
 Synset('attribute.n.02'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

In [332]:
from nltk.stem import WordNetLemmatizer

# Lemmatize a few inflected forms, passing the part of speech for each word.
wnl = WordNetLemmatizer()
for word, pos in [
    ('running', wn.VERB),
    ('better', wn.ADJ),
    ('oxen', wn.NOUN),
    ('geese', wn.NOUN),
]:
    print(wnl.lemmatize(word, pos))

run
good
ox
goose


## Word Vectors with Gensim and Word2Vec

In [261]:
import gensim

In [262]:
import logging

In [263]:
# Show INFO-level log records with a timestamp so the gensim calls below report their progress.
LOG_FORMAT = "%(asctime)s: %(levelname)s : %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [264]:
from gensim.models import word2vec

In [153]:
# Corpus iterator over the text files in the WOTclean/ directory (relative to the notebook's
# working directory); the log output below lists the files that were picked up.
sentences = word2vec.PathLineSentences("WOTclean")

2018-02-08 15:08:24,867: INFO : reading directory WOTclean/
2018-02-08 15:08:24,917: INFO : files read into PathLineSentences:WOTclean/CrossroadsOfTwilight.txt
WOTclean/CrownOfSwords.txt
WOTclean/DragonReborn.txt
WOTclean/EyeOfTheWorld.txt
WOTclean/FiresOfHeaven.txt
WOTclean/GatheringStorm.txt
WOTclean/GreatHunt.txt
WOTclean/KnifeOfDreams.txt
WOTclean/LordOfChaos.txt
WOTclean/MemoryOfLight.txt
WOTclean/PathOfDaggers.txt
WOTclean/ShadowRising.txt
WOTclean/TowersOfMidnight.txt
WOTclean/WintersHeart.txt


In [154]:
# Train 200-dimensional word vectors on the corpus.
# gensim 4.0 renamed the `size` keyword to `vector_size`; try the new name first and fall
# back to the old one so this cell runs on either major version.
try:
    model = word2vec.Word2Vec(sentences, vector_size=200)
except TypeError:  # gensim < 4.0 rejects the unknown `vector_size` keyword
    model = word2vec.Word2Vec(sentences, size=200)

2018-02-08 15:10:06,412: INFO : collecting all words and their counts
2018-02-08 15:10:06,415: INFO : reading file WOTclean/CrossroadsOfTwilight.txt
2018-02-08 15:10:06,458: INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-02-08 15:10:06,521: INFO : reading file WOTclean/CrownOfSwords.txt
2018-02-08 15:10:06,622: INFO : reading file WOTclean/DragonReborn.txt
2018-02-08 15:10:06,705: INFO : reading file WOTclean/EyeOfTheWorld.txt
2018-02-08 15:10:06,801: INFO : reading file WOTclean/FiresOfHeaven.txt
2018-02-08 15:10:06,919: INFO : reading file WOTclean/GatheringStorm.txt
2018-02-08 15:10:07,022: INFO : reading file WOTclean/GreatHunt.txt
2018-02-08 15:10:07,117: INFO : reading file WOTclean/KnifeOfDreams.txt
2018-02-08 15:10:07,234: INFO : reading file WOTclean/LordOfChaos.txt
2018-02-08 15:10:07,370: INFO : reading file WOTclean/MemoryOfLight.txt
2018-02-08 15:10:07,487: INFO : reading file WOTclean/PathOfDaggers.txt
2018-02-08 15:10:07,571: INFO : reading

2018-02-08 15:10:24,626: INFO : reading file WOTclean/GreatHunt.txt
2018-02-08 15:10:24,831: INFO : reading file WOTclean/KnifeOfDreams.txt
2018-02-08 15:10:25,053: INFO : reading file WOTclean/LordOfChaos.txt
2018-02-08 15:10:25,355: INFO : reading file WOTclean/MemoryOfLight.txt
2018-02-08 15:10:25,594: INFO : reading file WOTclean/PathOfDaggers.txt
2018-02-08 15:10:25,632: INFO : PROGRESS: at 94.12% examples, 880445 words/s, in_qsize 6, out_qsize 0
2018-02-08 15:10:25,762: INFO : reading file WOTclean/ShadowRising.txt
2018-02-08 15:10:26,030: INFO : reading file WOTclean/TowersOfMidnight.txt
2018-02-08 15:10:26,264: INFO : reading file WOTclean/WintersHeart.txt
2018-02-08 15:10:26,493: INFO : worker thread finished; awaiting finish of 2 more threads
2018-02-08 15:10:26,495: INFO : worker thread finished; awaiting finish of 1 more threads
2018-02-08 15:10:26,500: INFO : worker thread finished; awaiting finish of 0 more threads
2018-02-08 15:10:26,501: INFO : training on 21666845 raw 

In [333]:
# Ten nearest neighbours of "sleep" as (word, similarity score) pairs; the top hits are mostly other everyday verbs.
model.wv.most_similar("sleep")

[('drink', 0.6279441118240356),
 ('eat', 0.6085196137428284),
 ('tonight', 0.5919203758239746),
 ('get', 0.557205080986023),
 ('talk', 0.545413613319397),
 ('go', 0.5423065423965454),
 ('wait', 0.5408374667167664),
 ('leave', 0.5343519449234009),
 ('die', 0.5315142273902893),
 ('finish', 0.525676965713501)]

In [306]:
# Neighbours of "sword": other weapons, plus a few names that look specific to this corpus
# (e.g. 'ashandarei', 'callandor').
model.wv.most_similar("sword")

[('blade', 0.8039987683296204),
 ('axe', 0.7913359999656677),
 ('dagger', 0.7496095895767212),
 ('hammer', 0.7433908581733704),
 ('spear', 0.7332878708839417),
 ('knife', 0.6845142841339111),
 ('ashandarei', 0.6808995008468628),
 ('bow', 0.6612876653671265),
 ('callandor', 0.6592528223991394),
 ('medallion', 0.6419517993927002)]

In [307]:
# Vector analogy "king" - "man" + "woman". On this corpus 'queen' comes second, just behind 'daughter'.
model.wv.most_similar(positive=["king", "woman"], negative=["man"])

[('daughter', 0.7457714080810547),
 ('queen', 0.7173532247543335),
 ('empress', 0.7020495533943176),
 ('daughterheir', 0.6820720434188843),
 ('trakand', 0.6753557920455933),
 ('panarch', 0.6540794372558594),
 ('damodred', 0.6532011032104492),
 ('throne', 0.6345314383506775),
 ('eldest', 0.6304309368133545),
 ('andor', 0.6277015209197998)]