from words import *
from nltk.wordnet import *
from operator import itemgetter
import nltk
import re
from string import join
def build_word_associations():
    """Build a conditional frequency distribution of word co-occurrences.

    Walks the tagged sentences of the Brown corpus.  Every sentence is
    lowercased and stripped of English stop words; then, for each remaining
    token, each of the (up to) four tokens following it in the filtered
    sentence is counted as an associate when the first character of its tag
    equals the first character of the token's own tag.

    Returns:
        The ``nltk.ConditionalFreqDist`` in which ``cfd[token]`` holds the
        counts of the associates recorded for ``token``.
    """
    cfd = nltk.ConditionalFreqDist()
    # Membership is tested once per corpus token, so use a set.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    for tagged_sent in nltk.corpus.brown.tagged_sents():
        # Lowercase and drop stop words once, up front; everything left in
        # `content` is a non-stop word, so no further checks are needed.
        content = []
        for (token, tag) in tagged_sent:
            token = token.lower()
            if token not in stopwords:
                content.append((token, tag))
        for (index, (token, tag)) in enumerate(content):
            # Slice index+1 .. index+4: the next four filtered tokens.
            for (window_token, window_tag) in content[index+1:index+5]:
                # Compare tag initials by value; identity (`is`) on strings
                # only works when the interpreter happens to share them.
                if window_tag[0] == tag[0]:
                    cfd[token].inc(window_token)
    return cfd
def associate():
while True:
word = raw_input("Enter a word: ")
for i in range(100):
next = cfd[word].max()
if next:
print "->", next,
word = next
else:
break
print
def build_word_contexts(words):
    """Index words by the (previous, next) word pair surrounding them.

    The input is lowercased first.  A context is the string
    ``previous + "_" + next`` for every interior position of the sequence
    (the first and last words have no context of their own).

    Args:
        words: sequence of word strings.

    Returns:
        A pair ``(words_to_contexts, contexts_to_words)`` of plain dicts.
        ``contexts_to_words`` maps each context to the list of words seen
        in it; ``words_to_contexts`` is the inverse.  Both lists keep one
        entry per occurrence, so repeats encode frequency.
    """
    lowered = [w.lower() for w in words]

    contexts_to_words = {}
    for position in range(1, len(lowered) - 1):
        key = lowered[position - 1] + "_" + lowered[position + 1]
        contexts_to_words.setdefault(key, []).append(lowered[position])

    # Invert the index; duplicates are kept on purpose to track frequency.
    words_to_contexts = {}
    for (key, members) in contexts_to_words.items():
        for member in members:
            words_to_contexts.setdefault(member, []).append(key)

    return words_to_contexts, contexts_to_words
def search_contexts(words):
words_to_contexts, contexts_to_words = build_word_contexts(words)
while True:
hits = []
word = raw_input("word> ")
if word not in words_to_contexts:
print "Word not found"
continue
contexts = words_to_contexts[word]
for w in words_to_contexts: # all words
for context in words_to_contexts[w]:
if context in contexts:
hits.append(w)
hit_freqs = count_words(hits).items()
sorted_hits = sorted(hit_freqs, key=itemgetter(1), reverse=True)
words = [word for (word, count) in sorted_hits[1:] if count > 1]
print join(words)
def lookup(word):
for category in [N, V, ADJ, ADV]:
if word in category:
for synset in category[word]:
print category[word], ":", synset.gloss
############################################
# Simple Tagger
############################################
# map brown pos tags
# http://khnt.hit.uib.no/icame/manuals/brown/INDEX.HTM
def map1(tag):
    """First simplification pass over a Brown corpus tag.

    Applies a fixed sequence of regex rewrites, in order: strips the
    foreign-word prefix and the headline/title/cited suffixes, folds the
    "be"/"have"/"do" verb tags into ``vb``, rewrites ``nc`` to ``nn`` and
    removes every ``z``.  All patterns are lowercase, so the tag should
    already be lowercased.

    Args:
        tag: tag string.

    Returns:
        The rewritten tag string.
    """
    # Order matters: each rule sees the output of the previous one.
    rewrites = (
        (r'fw-', ''),       # foreign words
        (r'-[th]l', ''),    # headlines, titles
        (r'-nc', ''),       # cited
        (r'ber?', 'vb'),    # verb "to be"
        (r'hv', 'vb'),      # verb "to have"
        (r'do', 'vb'),      # verb "to do"
        (r'nc', 'nn'),      # cited word
        (r'z', ''),         # third-person singular
    )
    for (pattern, replacement) in rewrites:
        tag = re.sub(pattern, replacement, tag)
    return tag
def map2(tag):
    """Second simplification pass: collapse a tag to short uppercase codes.

    Applies a fixed sequence of regex rewrites, in order.  Most rules are
    anchored at a word boundary and replace a lowercase tag (or tag family)
    with an uppercase code; every ``s`` is dropped part-way through, and
    the last two rules remove ``-t`` and any remaining lowercase letter or
    ``*``.

    Args:
        tag: tag string (lowercase patterns, so it should be lowercased).

    Returns:
        The rewritten tag string.
    """
    # Order matters: each rule sees the output of the previous one, and
    # the uppercase replacements are immune to the later lowercase rules.
    rewrites = (
        (r'\bj[^-+]*', 'J'),    # adjectives
        (r'\bp[^-+]*', 'P'),    # pronouns
        (r'\bm[^-+]*', 'M'),    # modals
        (r'\bq[^-+]*', 'Q'),    # qualifiers
        (r'\babl', 'Q'),        # qualifiers
        (r'\bab[nx]', 'D'),     # determiners
        (r'\bap', 'D'),         # determiners
        (r'\bd[^-+]*', 'D'),    # determiners
        (r'\bat', 'D'),         # determiners
        (r'\bw[^-+]*', 'W'),    # wh words
        (r'\br[^-+]*', 'R'),    # adverbs
        (r'\bto', 'T'),         # "to"
        (r'\bc[cs]', 'C'),      # conjunctions
        (r's', ''),             # plurals
        (r'\bin', 'I'),         # prepositions
        (r'\buh', 'U'),         # interjections (uh)
        (r'\bex', 'E'),         # existential "there"
        (r'\bvbn', 'VN'),       # past participle
        (r'\bvbd', 'VD'),       # past tense
        (r'\bvbg', 'VG'),       # gerund
        (r'\bvb', 'V'),         # verb
        (r'\bnn', 'N'),         # noun
        (r'\bnp', 'NP'),        # proper noun
        (r'\bnr', 'NR'),        # adverbial noun
        (r'\bex', 'E'),         # existential "there"
        (r'\bod', 'OD'),        # ordinal
        (r'\bcd', 'CD'),        # cardinal
        (r'-t', ''),            # misc
        (r'[a-z\*]', ''),       # misc
    )
    for (pattern, replacement) in rewrites:
        tag = re.sub(pattern, replacement, tag)
    return tag
def map(tag):
    """Simplify a Brown tag: lowercase it, then apply map1 followed by map2.

    Note that this module-level name shadows the builtin ``map`` for the
    rest of the file.
    """
    lowered = tag.lower()
    normalised = map1(lowered)
    return map2(normalised)
# print sorted(set(map2(map1(tag)) for s in brown.tagged() for w,tag in s))
def load_brown_corpus(sections):
    """Load tagged Brown sentences with lowercased words and simplified tags.

    Args:
        sections: iterable passed, as a tuple, to
            ``nltk.corpus.brown.tagged_sents`` to select what is loaded.

    Returns:
        A list of sentences, each a list of ``(word.lower(), map(tag))``
        pairs.
    """
    global map  # refer to this file's map(), which shadows the builtin
    tagged_sents = nltk.corpus.brown.tagged_sents(tuple(sections))
    simplified = []
    for sent in tagged_sents:
        simplified.append(
            [(word.lower(), map(brown_tag)) for (word, brown_tag) in sent])
    return simplified
def train_tagger(corpus):
    """Train a trigram tagger that backs off to bigram, unigram and default.

    The chain is built as Default('N') <- Unigram <- Bigram <- Trigram via
    each tagger's ``backoff`` argument; the unigram, bigram and trigram
    taggers are then trained, in that order, on the same corpus with
    ``verbose=True``.

    Args:
        corpus: training data handed unchanged to each ``train`` call.

    Returns:
        The trained trigram tagger (head of the backoff chain).
    """
    default = nltk.tag.Default('N')
    unigram = nltk.tag.Unigram(cutoff=0, backoff=default)
    bigram = nltk.tag.Bigram(cutoff=0, backoff=unigram)
    trigram = nltk.tag.Trigram(cutoff=1, backoff=bigram)
    for tagger in (unigram, bigram, trigram):
        tagger.train(corpus, verbose=True)
    return trigram
def tag(corpus):
print "Training tagger..."
tagger = train_tagger(corpus)
while True:
text = raw_input("sentence> ")
words = text.split()
print join(word+"/"+tag for word, tag in tagger.tag(words))
# Regex fragment matching one word or one tag: a run of characters that
# contains neither "/" nor a space.
WORD_OR_TAG = '[^/ ]+'
# Regex word-boundary assertion placed on both sides of every term.
BOUNDARY = r'\b'


def process(pattern):
    """Translate a search pattern into a regex over ``word/tag`` text.

    The pattern is split on whitespace and each term is expanded:

    * all uppercase letters  -> any word carrying that tag;
    * contains a "/"         -> used as given (explicit ``word/tag``);
    * anything else          -> that word with any tag.

    Every expanded term is wrapped in word boundaries and the terms are
    joined with single spaces.

    Args:
        pattern: the user's search string.

    Returns:
        The regex source string (empty when the pattern has no terms).
    """
    def expand(term):
        if re.match('[A-Z]+$', term):
            core = WORD_OR_TAG + '/' + term
        elif '/' in term:
            core = term
        else:
            core = term + '/' + WORD_OR_TAG
        return BOUNDARY + core + BOUNDARY

    return ' '.join([expand(term) for term in pattern.split()])
def search(corpus, num=25):
print "Loading corpus..."
strings = [join(w+'/'+t for (w,t) in sent) for sent in corpus]
while True:
pattern = ""
while not pattern:
pattern = raw_input("search> ")
pattern = process(pattern)
i = 0
for sent in strings:
m = re.search(pattern, sent)
if m:
sent = ' '*35 + sent + ' '*45
print sent[m.start():m.start()+80]
i += 1
if i > num:
break
############################################
# Wordnet Browser
# now incorporated into NLTK as wordnet.browse
############################################
############################################
# Mad Libs
############################################
# Mad Libs template.  Each %(KEY)s is a named string-format field, so the
# text is filled in with `madlib % mapping`, where `mapping` is a dict that
# provides the keys NP, N, VG, J, VD, V, P, C and NR.
# NOTE(review): the keys look like the simplified tags produced by map2();
# presumably each is meant to be filled with a word carrying that tag --
# confirm.
madlib = """Britney Spears will meet up with her %(NP)s label for
crisis talks about the future of her %(N)s this week reports Digital Spy.
%(NP)s Records plan to tell Spears to stop %(VG)s and take more
care of her %(J)s image if she wants to avoid being %(VD)s by the noun.
The news %(V)s shortly after Britney posted a message on her
website promising a new album and tour. The last couple of years
have been quite a ride for me, the media has criticized %(P)s every
noun %(C)s printed a skewed perception of who I really am as a human
being, she wrote in a letter posted %(NR)s."""
# mapping = {}
# mapping['NP'] =
# mapping['N'] =
# mapping['VG'] =
# mapping['J'] =
# mapping['VD'] =
# mapping['V'] =
# mapping['P'] =
# mapping['C'] =
# mapping['NR'] =
# print madlib % mapping