import itertools
import math
import pickle
import string

import nltk
import numpy
# from nltk.corpus import gutenberg
from nltk.corpus import brown
# from nltk.corpus import reuters
from nltk.corpus import stopwords

# Tokens that are never counted as words: English stopwords, the single
# punctuation characters, and a few multi-character punctuation tokens.
IGNORE = stopwords.words('english') + list(string.punctuation) + [',"', '``', "''", '--']

# Snapshot of IGNORE as a set so membership tests are O(1) instead of a list
# scan per token.  Taken once at import time; later edits to IGNORE are not
# reflected here.
_IGNORE_SET = frozenset(IGNORE)


def ignore_string(string):
    """Return True if string is one of the IGNORE tokens or is all digits."""
    return string in _IGNORE_SET or string.isdigit()


def svd(vectors):
    """Do SVD on the m vectors of n values, return the three component matrices.

    Returns (U, Sigma, Vt) where Sigma is the singular values laid out on the
    diagonal of a zero matrix with the same shape as the input array.
    """
    # Convert vectors to numpy array of floats
    array = numpy.array(vectors, dtype=float)
    # U: m X m, sigma: vector of min(m, n) singular values, Vt: n X n
    U, sigma, Vt = numpy.linalg.svd(array)
    # U, sigma, Vt = numpy.linalg.svd(numpy.transpose(array))
    # Convert sigma to diagonal matrix (m X n)
    Sigma = numpy.zeros_like(array)
    n = min(array.shape)
    Sigma[:n, :n] = numpy.diag(sigma)
    return U, Sigma, Vt


def svd_compress(vectors, dims):
    """Run SVD on vectors and compress to top dims dimensions.

    Returns the product of the first dims columns of U with the top-left
    dims X dims corner of Sigma, i.e. one row of width dims per input vector.
    """
    # Assume Vt can be safely ignored in the compression.
    U, Sigma, _ = svd(vectors)
    # First dims columns of U
    UR = U[:, :dims]
    # Limit Sigma to square matrix of width dims
    SigmaR = Sigma[:dims, :dims]
    # First dims rows of Vt
    # VtR = Vt[:dims,:]
    # return numpy.dot(UR, numpy.dot(SR, VtR))
    # return numpy.transpose(numpy.dot(SigmaR, VtR))
    return numpy.dot(UR, SigmaR)


def seq_indices(seq, obj):
    """Return list of all indices of obj in seq."""
    res = []
    i = -1
    while True:
        try:
            # Resume the search just past the previous hit.
            i = seq.index(obj, i + 1)
        except ValueError:
            # No further occurrence: everything found so far is the answer.
            return res
        res.append(i)


class WSD(object):
    """Words, contexts, senses.

    Building an instance runs the whole pipeline: load the corpus, pick the
    most frequent words as context words and vector dimensions, compute IDF
    weights and co-occurrence vectors for the context words, compress the
    vectors with SVD, then build contexts and senses for each ambiguous word.
    """

    def __init__(self, ambig_words, corpus=brown, fileids=None,
                 n_context_words=5000, n_global_dims=1000, n_compress_dims=100):
        self.n_context_words = n_context_words
        self.n_global_dims = n_global_dims
        self.n_compress_dims = n_compress_dims
        print("Loading corpus %s" % (corpus,))
        # This is needed to calculate inverse doc frequency
        # Each paragraph is treated as one "document": the set of its
        # lower-cased words.
        self.docs = [set(w.lower() for sent in para for w in sent)
                     for para in corpus.paras(fileids)]
        self.log_n_docs = math.log(len(self.docs))
        self.doc_words = [word for doc in self.docs for word in doc]
        # The actual corpus
        self.text = [w.lower() for w in corpus.words(fileids)]
        print("Getting context words and dimensions")
        self.context_words, self.global_dims = self.get_global()
        self.word_vecs = []
        self.idfs = []
        print("Setting inverse document frequencies of context words")
        self.set_idfs()
        print("Setting representations for context words")
        self.make_words()
        print("Compressing words")
        self.compress_words()
        print("Creating ambiguous word objects")
        self.ambig_words = [Word(word, self) for word in ambig_words]
        print("Making contexts for ambiguous words")
        self.context_vecs = {}
        self.word_contexts = {}
        self.set_contexts()
        print("Making senses for ambiguous words")
        self.make_senses()

    def global_word_counts(self):
        """Return a list of word, count pairs sorted by count.

        Highest counts come first; ignored strings (see ignore_string) are
        not counted.
        """
        counts = {}
        for word in self.text:
            if not ignore_string(word):
                counts[word] = counts.get(word, 0) + 1
        # Stable descending sort: words with equal counts keep the order in
        # which the dict yields them, as with the earlier cmp-based sort.
        return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    def get_global(self):
        """Return lists of n_context_words and n_global_dims most frequent words."""
        words = [w[0] for w in self.global_word_counts()]
        return words[:self.n_context_words], words[:self.n_global_dims]

    def make_words(self):
        """Make representations for context words."""
        # self.word_vecs = [Word(word, self).make_repr() for word in self.context_words]
        for n, word in enumerate(self.context_words, 1):
            self.word_vecs.append(Word(word, self).make_repr())
            if n % 100 == 0:
                print("Made %d word representations" % n)

    def _context_index(self):
        """Return a word -> position map for context_words, built on first use.

        The map is cached on the instance, so it assumes context_words is not
        modified after the first lookup.
        """
        cache = self.__dict__.get('_context_index_cache')
        if cache is None:
            cache = {}
            for i, word in enumerate(self.context_words):
                # setdefault keeps the first position, matching list.index.
                cache.setdefault(word, i)
            self._context_index_cache = cache
        return cache

    def get_word_vec(self, form, weight=False):
        """Return the vector stored for context word form, or None.

        If weight is true the vector is multiplied by the word's IDF.  None is
        returned when form is not one of the context words.
        """
        index = self._context_index().get(form)
        if index is None:
            return None
        w_repr = self.word_vecs[index]
        if weight:
            w_repr = self.idfs[index] * w_repr
        return w_repr

    def compress_words(self):
        """Replace word_vecs by their SVD compression to n_compress_dims columns."""
        self.word_vecs = svd_compress(self.word_vecs, self.n_compress_dims)

    def set_idfs(self):
        """Set the inverse document frequency for context words.

        Appends log(number of docs) - log(number of docs containing the word)
        to idfs for each context word, in context_words order.
        """
        # doc_words holds each document's distinct words, so one counting pass
        # gives the document frequency of every word.
        doc_freq = {}
        for word in self.doc_words:
            doc_freq[word] = doc_freq.get(word, 0) + 1
        for word in self.context_words:
            # A word found in no document makes math.log raise ValueError.
            count = doc_freq.get(word, 0)
            self.idfs.append(self.log_n_docs - math.log(count))

    def set_contexts(self):
        """Make context vectors for ambiguous words."""
        for word in self.ambig_words:
            print("Making contexts for %s" % (word.form,))
            word.set_contexts()
            # self.context_vecs[word] = w.context_vecs
            # self.word_contexts[word] = w.word_contexts

    def make_senses(self):
        """Make senses for each ambiguous word."""
        for word in self.ambig_words:
            print("Making senses for %s" % (word.form,))
            word.make_senses()

    def pickle(self):
        """Dump this object to 'wsd.pkl' in the current directory."""
        # Binary mode: pickle data is bytes, and text mode can corrupt it.
        with open('wsd.pkl', 'wb') as out:
            pickle.dump(self, out)

    @staticmethod
    def unpickle():
        """Load and return the object stored in 'wsd.pkl'."""
        # WARNING: unpickling can execute arbitrary code; only load a
        # 'wsd.pkl' that this program wrote itself.
        with open('wsd.pkl', 'rb') as src:
            return pickle.load(src)


class Word(object):
    """Vector representations of words."""

    def __init__(self, form, wsd=None, w_width=25, n_clusters=3):
        self.form = form
        self.wsd = wsd
        # Number of tokens taken on each side of an occurrence
        self.w_width = w_width
        self.n_clusters = n_clusters
        self.repr = []
        self.word_contexts = []
        self.context_vecs = []
        self.context_dict = {}
        # Positions of word in corpus
        self.indices = seq_indices(wsd.text, form) if wsd else []
        # clusterer for senses; only needed for ambiguous words
        self.clusterer = None
        # cluster dendrogram
        self.dendrogram = None
        # Sense objects
        self.senses = []

    def __repr__(self):
        # The previous format string had no %s, so formatting raised
        # TypeError; the angle-bracket form is a reconstruction.
        return "<Word %s>" % (self.form,)

    def get_word_context(self, text, index):
        """Return up to w_width tokens on each side of text[index], excluding it."""
        return text[max(0, index - self.w_width):index] + \
            text[index + 1:index + self.w_width + 1]

    def set_word_contexts(self):
        """Collect the token window around every occurrence of the word."""
        text = self.wsd.text
        self.word_contexts = [self.get_word_context(text, index)
                              for index in self.indices]

    def set_contexts(self):
        """Build a context vector for every occurrence, then index them."""
        vec_width = self.wsd.n_compress_dims
        # A full window has w_width tokens on each side.
        w_width = self.w_width * 2
        wsd = self.wsd
        if not self.word_contexts:
            self.set_word_contexts()
        self.context_vecs = [Context(wc, vec_width, w_width, wsd).vector
                             for wc in self.word_contexts]
        self.make_context_dict()

    def make_repr(self):
        """Return co-occurrence counts of the word with each global dimension word.

        The result is a list with one count per entry of wsd.global_dims,
        summed over the windows around every occurrence of the word.
        """
        words = self.wsd.global_dims
        text = self.wsd.text
        # Word -> dimension map so each window token costs one dict lookup;
        # setdefault keeps the first position, matching list.index.
        dim_index = {}
        for i, dim_word in enumerate(words):
            dim_index.setdefault(dim_word, i)
        vec = [0] * len(words)
        for index in self.indices:
            for word in self.get_word_context(text, index):
                word_index = dim_index.get(word)
                if word_index is not None:
                    vec[word_index] += 1
        return vec

    def cluster_contexts(self):
        """Cluster the context vectors and keep the clusterer and its dendrogram."""
        self.clusterer = nltk.cluster.gaac.GAAClusterer(self.n_clusters)
        self.clusterer.cluster(self.context_vecs)
        self.dendrogram = self.clusterer.dendrogram()

    def make_senses(self):
        """Append one Sense per cluster, clustering first if not done yet."""
        if not self.clusterer:
            self.cluster_contexts()
        # Pairs each dendrogram group with a centroid by position.
        for contexts, centroid in zip(self.dendrogram.groups(self.n_clusters),
                                      self.clusterer._centroids):
            self.senses.append(self.make_sense(contexts, centroid))

    def make_sense(self, context_vecs, centroid):
        """Return a Sense for context_vecs, looking up their word contexts."""
        word_contexts = [self.context_dict[tuple(vec)] for vec in context_vecs]
        return Sense(self, context_vecs, word_contexts, centroid)

##    def show_senses(self):
##        if self.dendrogram:
##            labels = [str(x) for x in range(len(self.context_vecs))]
##            self.dendrogram.show(leaf_labels=labels)

    def get_clusters(self, n):
        """Return the dendrogram's n groups, or None if nothing is clustered yet."""
        if self.dendrogram:
            return self.dendrogram.groups(n)

    def distance(self, v1, v2):
        """Return the cosine distance between v1 and v2."""
        # NOTE(review): current NLTK defines cosine_distance in
        # nltk.cluster.util -- confirm nltk.util exposes it in the version used.
        return nltk.util.cosine_distance(v1, v2)

    def make_context_dict(self):
        """Map each context vector (as a tuple) to its list of context words."""
        for vec, words in zip(self.context_vecs, self.word_contexts):
            # Identical vectors share one key; the later context wins.
            self.context_dict[tuple(vec)] = words


class Context(object):
    """Vector representations of contexts of word tokens."""

    def __init__(self, word_context, vec_width, w_width, wsd):
        self.word_context = word_context
        self.vec_width = vec_width
        self.w_width = w_width
        self.wsd = wsd
        self.vector = self.make_vector()

    def make_vector(self):
        """Return the sum of the weighted context word vectors over w_width.

        Words that wsd has no vector for are skipped.
        """
        array = numpy.zeros(self.vec_width)
        for word in self.word_context:
            word_repr = self.wsd.get_word_vec(word, True)
            # Identity test: comparing a numpy array with != is elementwise,
            # and the result has no single truth value.
            if word_repr is not None:
                array = word_repr + array
        array /= self.w_width
        return array


class Sense(object):
    """Cluster of context vectors, with a centroid and associated context words."""

    def __init__(self, word, context_vecs, word_contexts, centroid):
        self.word = word
        self.context_vecs = context_vecs
        self.word_contexts = word_contexts
        self.centroid = centroid


# Runs the whole pipeline when the module is imported or executed.
wsd = WSD(['plant', 'suit'])

##suit = Word('suit', wsd=wsd)
##interest = Word('interest', wsd=wsd)

##A1 = [[3, 2, 4, 3], [4, 2, 4, 4], [4, 6, 10, 12], [3, 5, 9, 11]]
##A2 = [[3, 2, 4, 3], [3, 5, 9, 11], [4, 2, 4, 4], [4, 6, 10, 12]]

##def cooccur(words, sentence):
##    for w in words:
##        if w not in sentence:
##            return False
##    return True
##
##def sents_with_word(corpus, word, fileids=None):
##    return [s for s in corpus.sents(fileids) if word in s]
##
##def word_in_para(word, para):
##    for sent in para:
##        if word in sent:
##            return True
##    return False
##
##def paras_with_word(corpus, word, fileids=None):
##    return [p for p in corpus.paras(fileids) if word_in_para(word, p)]
##
##def cooccur_sent(corpus, words, fileids=None):
##    text = corpus.sents(fileids)
##    n_sents = len(text)
##    n_cooc = len([s for s in text if cooccur(words, s)])