
####### runHW2.py #####

from __future__ import division
import sys
from pprint import pprint
from collections import defaultdict
import nltk
from nltk.corpus import treebank
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from BetterICP import *
from nltk import InsideChartParser

## Main body of code ##

# Extracting tagged sentences using NLTK libraries
psents = treebank.parsed_sents()

# Comment out the following 3 lines if you get tired of seeing them
print " 1st parsed sentence: ", psents[0]
print " Productions in the 1st parsed sentence: "
pprint(psents[0].productions())

grammar = parse_pgrammar("""
# Grammatical productions.
S -> NP VP [1.0]
NP -> Pro [0.1] | Det N [0.3] | N [0.5] | NP PP [0.1]
VP -> Vi [0.05] | Vt NP [0.9] | VP PP [0.05]
Det -> Art [1.0]
PP -> Prep NP [1.0]
# Lexical productions.
Pro -> "i" [0.3] | "we" [0.1] | "you" [0.1] | "he" [0.3] | "she" [0.2]
Art -> "a" [0.4] | "an" [0.1] | "the" [0.5]
Prep -> "with" [0.7] | "in" [0.3]
N -> "salad" [0.4] | "fork" [0.3] | "mushrooms" [0.3]
Vi -> "sneezed" [0.5] | "ran" [0.5]
Vt -> "eat" [0.2] | "eats" [0.2] | "ate" [0.2] | "see" [0.2] | "saw" [0.2]
""")

sentence1 = "he ate salad"
sentence2 = "he ate salad with mushrooms"
sentence3 = "he ate salad with a fork"

# Un-comment the following 2 non-comment lines
# when working on `PCFG Parser` section in the lab.
## Initialize a parser with our toy probabilistic grammar
## (it will have 'S' as the start symbol),
## and parse a sentence
#sppc=BetterICP(grammar)
#sppc.parse(sentence1.split())

## Parse some more complex sentences
#sppc.parse(sentence2.split())
#sppc.parse(sentence3.split())

#prods = get_costed_productions(psents)
#ppg=PCFG(Nonterminal('NP'), prods)
#ppc=BetterICP(ppg,1000)
#print "beam = 1000"
#ppc.parse("the men".split(),True,3)
#ppc.beam(1075)
#print "beam = 1075"
#ppc.parse("the men".split(),True,3)
#ppc.trace(3)
#ppc.beam(1200)
#print "beam = 1200"
#ppc.parse("the men".split(),True,3)

#### fixesNLTK3.py #####

# fix buggy NLTK 3 :-(
# different fixes for different versions :-((((

import re, sys
import nltk
from nltk.grammar import _ARROW_RE, _PROBABILITY_RE, _DISJUNCTION_RE, Production
from nltk.draw import CFGEditor
from nltk.probability import ImmutableProbabilisticMixIn

ARROW = u'\u2192'
TOKEN = u'([\\w ]|\\\\((x[0-9a-f][0-9a-f])|(u[0-9a-f][0-9a-f][0-9a-f][0-9a-f])))+'

CFGEditor.ARROW = ARROW
CFGEditor._TOKEN_RE = re.compile(u"->|u?'"+TOKEN+u"'|u?\""+TOKEN+u"\"|\\w+|("+ARROW+u")")
CFGEditor._PRODUCTION_RE = re.compile(ur"(^\s*\w+\s*)" +
                                      ur"(->|("+ARROW+"))\s*" +
                                      ur"((u?'"+TOKEN+"'|u?\""+TOKEN+"\"|''|\"\"|\w+|\|)\s*)*$")
nltk.grammar._TERMINAL_RE = re.compile(ur'( u?"[^"]+" | u?\'[^\']+\' ) \s*', re.VERBOSE)
nltk.grammar._ARROW_RE = re.compile(ur'\s* (->|'+ARROW+') \s*', re.VERBOSE)

from nltk.grammar import _TERMINAL_RE

if sys.version_info[0]>2 or sys.version_info[1]>6:
    from nltk.grammar import PCFG, CFG, ProbabilisticProduction as FixPP
    parse_grammar=CFG.fromstring
    parse_pgrammar=PCFG.fromstring
    from nltk import InsideChartParser
    def nbest_parse(self,tokens,n=None):
        parses=self.parse(tokens)
        if n is None:
            return [parse for parse in parses]
        else:
            # next(parses) rather than parses.next(), so this
            # branch works under both Python 2.7 and Python 3
            return [next(parses) for i in range(n)]
    InsideChartParser.nbest_parse=nbest_parse
else:
    from nltk.grammar import WeightedGrammar as PCFG, WeightedProduction as FixPP
    from nltk import parse_cfg, parse_pcfg
    parse_grammar=parse_cfg
    parse_pgrammar=parse_pcfg

def fix_parse_production(line, nonterm_parser, probabilistic=False):
    """
    Parse a grammar rule, given as a string, and return a list of productions.
    """
    pos = 0

    # Parse the left-hand side.
    lhs, pos = nonterm_parser(line, pos)

    # Skip over the arrow.
    m = _ARROW_RE.match(line, pos)
    if not m: raise ValueError('Expected an arrow')
    pos = m.end()

    # Parse the right hand side.
    probabilities = [0.0]
    rhsides = [[]]
    while pos < len(line):
        # Probability.
        m = _PROBABILITY_RE.match(line, pos)
        if probabilistic and m:
            pos = m.end()
            probabilities[-1] = float(m.group(1)[1:-1])
            if probabilities[-1] > 1.0:
                raise ValueError('Production probability %f, '
                                 'should not be greater than 1.0' %
                                 (probabilities[-1],))

        # String -- add terminal.
        elif (line[pos] in "\'\"" or line[pos:pos+2] in ('u"',"u'")):
            m = _TERMINAL_RE.match(line, pos)
            if not m: raise ValueError('Unterminated string')
            rhsides[-1].append(eval(m.group(1)))
            pos = m.end()

        # Vertical bar -- start new rhside.
        elif line[pos] == '|':
            m = _DISJUNCTION_RE.match(line, pos)
            probabilities.append(0.0)
            rhsides.append([])
            pos = m.end()

        # Anything else -- nonterminal.
        else:
            nonterm, pos = nonterm_parser(line, pos)
            rhsides[-1].append(nonterm)

    if probabilistic:
        return [FixPP(lhs, rhs, prob=probability)
                for (rhs, probability) in zip(rhsides, probabilities)]
    else:
        return [Production(lhs, rhs) for rhs in rhsides]

if sys.version_info[0]>2 or sys.version_info[1]>6:
    nltk.grammar._read_production=fix_parse_production
else:
    nltk.grammar.parse_production=fix_parse_production

class CostedProduction(FixPP):
    """
    A probabilistic context free grammar production using costs.
    A PCFG ``ProbabilisticProduction`` is essentially just a ``Production``
    that has an associated probability, which represents how likely it is
    that this production will be used.  In particular, the probability of a
    ``ProbabilisticProduction`` records the likelihood that its right-hand
    side is the correct instantiation for any given occurrence of its
    left-hand side.

    :see: ``Production``
    """
    def __init__(self, lhs, rhs, cost):
        """
        Construct a new ``ProbabilisticProduction``.

        :param lhs: The left-hand side of the new ``ProbabilisticProduction``.
        :type lhs: Nonterminal
        :param rhs: The right-hand side of the new ``ProbabilisticProduction``.
        :type rhs: sequence(Nonterminal and terminal)
        :param cost: Cost (negative base-2 log probability) of the new
            ``ProbabilisticProduction``.
        """
        ImmutableProbabilisticMixIn.__init__(self, logprob=-cost)
        Production.__init__(self, lhs, rhs)

    def __str__(self):
        return Production.__unicode__(self) + \
            (' [0.0]' if (self.logprob() == 0.0)
             else ' [%g]' % -self.logprob())

    def __repr__(self):
        return '%s'%str(self)

    def cost(self):
        return 0.0 if self.logprob() == 0.0 else -self.logprob()
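A quick usage illustration (my own, not part of the handout): a cost here is a negative base-2 log probability, so a production with cost 1.0 has probability 0.5:

from nltk import Nonterminal

# Hypothetical example production with a cost of 1 bit.
p = CostedProduction(Nonterminal('NP'), [Nonterminal('Det'), Nonterminal('N')], 1.0)
print p          # NP -> Det N [1]  (the bracketed number is the cost)
print p.cost()   # 1.0
print p.prob()   # 0.5, since prob = 2**(-cost)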

####BetterICP.py###########

from __future__ import division
import sys
from pprint import pprint
from collections import defaultdict
import nltk
from nltk.corpus import treebank
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from nltk import InsideChartParser
from nltk.parse.chart import Chart, AbstractChartRule
from nltk.tree import Tree, ProbabilisticTree, _child_names
from nltk.parse.pchart import ProbabilisticFundamentalRule, ProbabilisticBottomUpInitRule, ProbabilisticTreeEdge, ProbabilisticLeafEdge
from nltk.parse.pchart import SingleEdgeProbabilisticFundamentalRule
from math import log

# Renamed between 3.0 and 3.0.4 :-(
if not(hasattr(Chart,'pretty_format_edge')):
    Chart.pretty_format_edge=Chart.pp_edge

# nltk.parse.pchart is fundamentally broken, because it adds edges directly
# into the chart, where the fundamental rule (fr) can see them whether or
# not they've come off the agenda.

# The least-bad fix from outside I can come up with is implemented here:
# add a boolean var called 'pending', which is true by default, set to
# false only when the edge comes off the agenda, and which when true
# causes the edge to be ignored by fr.

# Possible bug? Even pending edges _are_ checked for when testing for
# redundancy (i.e. Chart.insert is _not_ changed), which means any
# failure of best-first might cause a cheaper edge to be discarded
# because an earlier, but still pending, identical-but-more-expensive
# edge is in the chart.

nltk.chart.EdgeI.pending=True

def productions_with_left_context(self,lpos=0,leaves=None):
    """
    Generate the productions that correspond to the non-terminal nodes of
    the tree, paired with their left-context word (or None).  For each
    subtree of the form (P: C1 C2 ... Cn) this produces a production of
    the form P -> C1 C2 ... Cn, paired with the word to the left of C1.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.productions_with_left_context()
    [(None, S -> NP VP), (None, NP -> D N), (None, D -> 'the'),
     ('the', N -> 'dog'), ('dog', VP -> V NP), ('dog', V -> 'chased'),
     ('dog', NP -> D N), ('chased', D -> 'the'), ('the', N -> 'cat')]

    :rtype: list(tuple(str or None, Production))
    """
    if leaves is None:
        leaves=self.leaves()
    #if not isinstance(self._label, string_types):
    #    raise TypeError('Productions can only be generated from trees having node labels that are strings')
    if lpos>0:
        lc=leaves[lpos-1]
    else:
        lc=None
    prods = [(lc,Production(Nonterminal(self._label), _child_names(self)))]
    for child in self:
        if isinstance(child, Tree):
            prods += child.productions_with_left_context(lpos,leaves)
            # could be much smarter
            lpos+=len(child.leaves())
        else:
            lpos+=1
    return prods

Tree.productions_with_left_context=productions_with_left_context

def production_distribution(psents):
    """
    Creates a frequency distribution of lexical and
    non-lexical (grammatical) productions
    """
    prod_dict = defaultdict(int)
    for psent in psents:
        for production in psent.productions():
            prod_dict[production]+=1
    return prod_dict

def nt_counts(prod_dict):
    '''Create a dictionary of non-terminals and their counts'''
    nt_dict=defaultdict(int)
    for (rule,count) in prod_dict.items():
        nt_dict[rule.lhs()]+=count
    return nt_dict

def cost(prob):
    return 0.0 if prob==1.0 else -log(prob,2)

def production_cost(production,lhs_counts,production_counts):
    pcount=production_counts[production]
    ntcount=lhs_counts[production.lhs()]
    return cost(float(pcount)/float(ntcount))

def get_costed_productions(psents):
    """
    Creates costed/weighted productions from a given
    list of parsed sentences.
    """
    prods_dict = production_distribution(psents)
    prods_nt_counts=nt_counts(prods_dict)
    costed_prods=[CostedProduction(p.lhs(),p.rhs(),
                                   production_cost(p,prods_nt_counts,prods_dict))
                  for p in prods_dict.keys()]
    return costed_prods

class BetterPBPR(AbstractChartRule):
    NUM_EDGES=1
    def apply(self, chart, grammar, edge):
        if edge.is_incomplete(): return
        for prod in grammar.productions():
            if edge.lhs() == prod.rhs()[0]:
                # check for X -> X
                if prod.lhs()==edge.lhs() and len(prod.rhs())==1:
                    continue
                new_edge = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob())
                if chart.insert(new_edge, ()):
                    yield new_edge

class BetterSEPFR(AbstractChartRule):
    NUM_EDGES=1

    _fundamental_rule = ProbabilisticFundamentalRule()

    def apply(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 = left_edge; edge2 = right_edge
            for edge2 in chart.select(start=edge1.end(), is_complete=True,
                                      lhs=edge1.nextsym()):
                if edge2.pending:
                    continue
                for new_edge in fr.apply(chart, grammar, edge1, edge2):
                    yield new_edge
        else:
            # edge2 = left_edge; edge1 = right_edge
            for edge2 in chart.select(end=edge1.start(), is_complete=False,
                                      nextsym=edge1.lhs()):
                if edge2.pending:
                    continue
                for new_edge in fr.apply(chart, grammar, edge2, edge1):
                    yield new_edge

class BetterICP(InsideChartParser):
    '''Implement a more user-friendly InsideChartParser, which will show
    intermediate results, and quit after finding a specified number of
    parses'''
    def parse(self, tokens, notify=True, max=0):
        '''Run a probabilistic parse of tokens.
        If notify is true, display each complete parse as it is found.
        If max>0, quit after finding that many parses.'''
        self._grammar.check_coverage(tokens)
        chart = Chart(list(tokens))
        chart._trace=self._trace # Bad form. . .
        grammar = self._grammar
        start = grammar.start()
        prod_probs = {}

        # Chart parser rules.
        bu_init = ProbabilisticBottomUpInitRule()
        bu = BetterPBPR() # avoid infinite numbers of parses :-(
        fr = BetterSEPFR() # don't look at pending edges
        # Our queue
        queue = []

        # Initialize the chart.
        for edge in bu_init.apply(chart, grammar):
            if self._trace > 1:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            queue.append(edge)

        found = 0
        while len(queue) > 0 and (max<1 or found<max):

            # Prune the queue to the correct size if a beam was defined
            if self.beam_size:
                self._prune(queue, chart)

            # Get the best edge.
            edge = queue.pop()
            edge.pending = False
            if self._trace > 0:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            if (edge.start()==0 and edge.end()==chart._num_leaves and
                edge.lhs()==start and edge.is_complete()):
                if len(prod_probs)==0:
                    for prod in grammar.productions():
                        prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
                if notify:
                    print "****"
                    for tree in chart.trees(edge, tree_class=ProbabilisticTree,
                                            complete=True):
                        self._setprob(tree, prod_probs)
                        print tree, '%.4g (%.4g)'%(cost(tree.prob()),cost(edge.prob()))
                        #print tree
                    print "****"
                found+=1
            # Apply BU & FR to it.
            queue.extend(fr.apply(chart, grammar, edge))
            queue.extend(bu.apply(chart, grammar, edge))

        # Get a list of complete parses.
        parses = list(chart.parses(grammar.start(), ProbabilisticTree))
        if not notify:
            for parse in parses:
                self._setprob(parse,prod_probs)

        # Sort by probability
        parses.sort(reverse=True, key=lambda tree: tree.prob())
        if notify:
            print "%s total parses found"%found
        return iter(parses)

    def _prune(self, queue, chart):
        """Discard items in the queue if the queue is longer than the beam."""
        if len(queue) > self.beam_size:
            split = len(queue)-self.beam_size
            if self._trace > 2:
                for edge in queue[:split]:
                    print(' %-50s [%.4g DISCARDED]' % (chart.pretty_format_edge(edge,2),
                                                       cost(edge.prob())))
            del queue[:split]

    def beam(self,width):
        self.beam_size=width
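To make the cost arithmetic above concrete (a worked illustration with made-up counts, not figures from the treebank): production_cost() is just relative-frequency estimation followed by a base-2 log. A production seen 25 times out of 100 occurrences of its left-hand side gets probability 0.25 and hence a cost of 2 bits:

from math import log

pcount = 25            # illustrative count of this production
ntcount = 100          # illustrative count of its left-hand side
prob = pcount / float(ntcount)   # 0.25
print -log(prob, 2)              # 2.0, i.e. cost(0.25) == 2.0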

#################QUESTION###################

Graded programming exercises: For subparts a-c below, follow the instructions and briefly answer the questions (marked in bold). Part d contains a programming question as well as asking for a written summary of your solution.

a) Create a PCFG by uncommenting the prods=... and ppg=PCFG(Nonterminal('NP'), prods) lines in runHW2.py and re-loading. (The first argument to PCFG is the start symbol and the second argument is the list of productions with their costs. To see the parts within the returned list, you can look at prods[i].rhs(), prods[i].lhs() and prods[i].cost(). To pass a field as a key argument, you can use lambda, e.g. key=lambda x: x.cost().) Now try sprods = ppg.productions(Nonterminal('S')), which gives a list of all the productions whose left-hand side is S. (Note that unlike Lisp, the expression will not automatically print the result; you need to evaluate it separately.) How many productions are there in the treebank that start with S? How many start with NP (or a variant)? (Write short functions to do these calculations; a sketch follows.)
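A minimal sketch of the kind of counting helpers part a) asks for; the names lhs_count and lhs_prefix_count are my own, not from the handout:

from nltk import Nonterminal

def lhs_count(prods, symbol):
    '''Count productions whose left-hand side is exactly `symbol`.'''
    return sum(1 for p in prods if p.lhs() == Nonterminal(symbol))

def lhs_prefix_count(prods, prefix):
    '''Count productions whose left-hand side starts with `prefix`,
    catching treebank variants such as NP-SBJ, NP-TMP, etc.'''
    return sum(1 for p in prods if str(p.lhs()).startswith(prefix))

# e.g., after prods = get_costed_productions(psents):
#   print lhs_count(prods, 'S'), lhs_prefix_count(prods, 'NP')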

b) In BetterICP, the second parameter of the constructor specifies the width of the beam (the maximum length of the queue used to represent the chart). Un-comment the following lines and run the code:

ppc=BetterICP(ppg,1000)
print "beam = 1000"
ppc.parse("the men".split(),True,3)

Now comment out the last two of the above three lines, remove the comments from the next three lines to widen the beam to 1075, and run again and see what happens. Then uncomment three more lines to increase the width to 1200 (but do not uncomment the trace line). Describe what happens after changing the beam width, and what that signifies in terms of the grammar.

c) Finally, comment out the last three lines and try editing runHW2.py to include three new lines of the form below, where BIGGERNUM is the smallest number big enough to get the simple DT NNS structure one might expect:

ppc.beam(BIGGERNUM)
print "beam = BIGGERNUM"
ppc.parse("the men".split(),True,3)

What was the smallest value you found for BIGGERNUM that finds the correct parse? Provide a table that shows different beam widths and the number of parses returned, including at least one beam value for each possible number of parses. (A sketch of a loop for collecting these figures follows.)
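One way to gather the data for that table — a rough sketch; the candidate widths below are placeholders to experiment with, not known answers:

# Sweep several beam widths and record how many parses come back;
# notify=False suppresses the per-parse printing during the sweep.
for width in [1000, 1075, 1200, 1500, 2000]:   # illustrative widths only
    ppc.beam(width)
    parses = list(ppc.parse("the men".split(), False, 3))
    print "beam = %d -> %d parse(s)" % (width, len(parses))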

d) Modify the grammar: Create a new file, runGrammar2.py, that defines a new grammar (a modification of the one provided), defines a new function that reads sentences from an input file, and then runs the parser on the sentences from the file and prints the output to another file called hw2_output.txt. Your updated toy grammar, called grammar2, should add grammatical and lexical productions (or add new items to the RHS of existing rules) as needed to handle:

- imperative sentences, such as "eat the salad"
- ditransitive sentences, such as "I read her the book" (but not *"I read she the book")
- passive sentences, such as "The book was read to her" (but not *"I read the book to she")

You should add a new category of pronoun, Pro_obj, for pronouns that can be indirect objects or objects of prepositions (such as him, her, them) but not subjects. Create a text file, hw2_input.txt, with enough test sentences, each on a separate line, to demonstrate that your grammar works correctly, and then run your runGrammar2.py code. (A sketch of the file-handling scaffold follows.)
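A minimal sketch of the input/output plumbing part d) asks for; the grammar productions themselves are the exercise and are omitted here, and parse_file is my own name, not part of the handout:

#### runGrammar2.py (scaffold) ####

from fixesNLTK3 import *
from BetterICP import *

grammar2 = parse_pgrammar("""
S -> NP VP [1.0]
# ... your extended grammatical and lexical productions go here ...
""")

def parse_file(infile='hw2_input.txt', outfile='hw2_output.txt'):
    """Parse each line of infile and write the best parse
    (or a failure note) to outfile."""
    parser = BetterICP(grammar2)
    with open(infile) as inf, open(outfile, 'w') as outf:
        for line in inf:
            sentence = line.strip()
            if not sentence:
                continue
            # notify=False, max=1: quietly stop after the first parse
            parses = list(parser.parse(sentence.split(), False, 1))
            outf.write(sentence + '\n')
            if parses:
                outf.write(str(parses[0]) + '\n\n')
            else:
                outf.write('No parse found\n\n')

parse_file()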
