import pprint
import re

import nltk


def _strip_comment(line):
    """Return `line` without its trailing '#' comment or surrounding whitespace."""
    return line.split('#')[0].strip()


def read(filename):
    """Read an interlinear sentence file and return one pair per sentence.

    Each record in the file is four consecutive lines: the words, their
    tags, their glosses, and a free translation.  Anything after a '#' on
    a line is dropped.  Lines that are empty once comments are removed
    are skipped, but only between records, not inside one.

    Returns a list of ``(triples, translation)`` pairs, where ``triples``
    is a list of ``(word, tag, gloss)`` tuples.  Words are lower-cased and
    have every '.' removed.

    Raises IndexError if the file ends in the middle of a record.
    """
    # Close the handle deterministically instead of leaking it.
    with open(filename) as source:
        raw = source.read()

    # Reversed so that pop() hands the lines back in file order.
    pending = raw.split('\n')[::-1]
    data = []
    while pending:
        line = _strip_comment(pending.pop())
        if not line:
            continue
        words = line.lower().replace('.', '').split()
        tags = _strip_comment(pending.pop()).split()
        glosses = _strip_comment(pending.pop()).split()
        trans = _strip_comment(pending.pop())
        # list() keeps the triples re-iterable on Python 3, where zip() is
        # a one-shot iterator; on Python 2 zip() already returns a list.
        data.append((list(zip(words, tags, glosses)), trans))
    return data


def sents(data):
    """Return the sentences in `data` as lists of (word, tag) pairs.

    `data` is a list shaped like the return value of read(); the gloss of
    each triple and the translation of each sentence are discarded.
    """
    return [[word[:2] for word in sentence[0]] for sentence in data]


data = read('sentences')
sentences = sents(data)

# FIXME(review): the patterns inside the braces below contain only
# quantifiers.  The <TAG> tokens they applied to appear to have been
# stripped out (looks like markup removal), and as far as I can tell the
# result is not a valid nltk tag pattern.  The remaining characters are
# kept in their original order; restore the tags from the original grammar.
# The trailing comments on each rule describe what it was meant to chunk.
grammar = r"""
  NP: {?*?}     # Chunk sequences of NN, JJ, DT
  NPR: {?}      # Chunk NPs with relative clauses
  SRel: {??}    # Chunk sequences of VBR and 0-2 NPs
  """
cp = nltk.RegexpParser(grammar, loop=2)

for s in sentences:
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(cp.parse(s))