Allow user to pass scan probability multiplier, resolves #3

digitalheir · Jan 8, 2017 · 59d42b9 · 59d42b9
1 parent b47b3db
commit 59d42b9
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 37 deletions.
diff --git a/src/earley/parser.ts b/src/earley/parser.ts
@@ -88,10 +88,10 @@ export function getViterbiParseFromChart<S, T>(state: State<S, T>, chart: Chart<
 }
 
 
-
 export function parseSentenceIntoChart<S, T>(Start: NonTerminal,
                                              grammar: Grammar<T, S>,
-                                             tokens: T[]): [Chart<T, S>, number, State<S, T>] {
+                                             tokens: T[],
+                                             scanProbability?: (x: T, t: Terminal<T>[]) => S): [Chart<T, S>, number, State<S, T>] {
     // ScanProbability scanProbability//TODO
 
     const stateSets: Chart<T, S> = new Chart(grammar);
@@ -125,7 +125,7 @@ export function parseSentenceIntoChart<S, T>(Start: NonTerminal,
     tokensWithWords.forEach(
         (token: WordWithTypes<T>) => {
             predict(i, grammar, stateSets);
-            scan(i, token, grammar.probabilityMapping.semiring, stateSets);
+            scan(i, token, grammar.probabilityMapping.semiring, stateSets, scanProbability);
             complete(i + 1, stateSets, grammar);
 
             const completedStates: State<S, T>[] = [];
@@ -153,8 +153,13 @@ export interface ParseTreeWithScore<T> {
 
 export function getViterbiParse<S, T>(Start: NonTerminal,
                                       grammar: Grammar<T, S>,
-                                      tokens: T[]): ParseTreeWithScore<T> {
-    const [chart, ignored, init] = parseSentenceIntoChart(Start, grammar, tokens);
+                                      tokens: T[],
+                                      scanProbability?: (x: T, t: Terminal<T>[]) => S): ParseTreeWithScore<T> {
+    const [chart, ignored, init] = parseSentenceIntoChart(Start, grammar, tokens, scanProbability);
+
+    if (!chart.has(init.rule, tokens.length,
+            0,
+            init.rule.right.length)) throw new Error("Could not parse sentence.");
 
     const finalState = chart.getOrCreate(
         tokens.length,
@@ -163,6 +168,7 @@ export function getViterbiParse<S, T>(Start: NonTerminal,
         init.rule
     );
 
+
     const parseTree: ParseTree<T> = getViterbiParseFromChart(finalState, chart);
     const toProbability = grammar.probabilityMapping.toProbability;
     const finalScore = chart.getViterbiScore(finalState).innerScore;

diff --git a/src/earley/scan.ts b/src/earley/scan.ts
@@ -1,4 +1,4 @@
-import {isNonTerminal, WordWithTypes} from "../grammar/category";
+import {isNonTerminal, WordWithTypes, Terminal} from "../grammar/category";
 import {Semiring} from "semiring";
 import {Chart} from "./chart/chart";
 import {getActiveCategory, State, advanceDot} from "./chart/state";
@@ -10,19 +10,18 @@ import {getActiveCategory, State, advanceDot} from "./chart/state";
  * @param tokenPosition   The start index of the scan.
  * @param word
  * @param types
- * //@param scanProbability Function that provides the probability of scanning the given token at this position. Might be null for a probability of 1.0.
+ * @param scanProbability Optional function that, given the scanned word and its terminal types, provides the probability of scanning that word as a semiring element. May be omitted for a probability of 1.0.
  * @param sr
  * @param stateSets
  */
 export function scan<S, T>(tokenPosition: number,
     {word, types}: WordWithTypes<T>,
-                           // scanProbability:(x:T)=>number,//TODO
                            sr: Semiring<S>,
-                           stateSets: Chart<T, S>) {
+                           stateSets: Chart<T, S>,
+                           scanProbability?: (x: T, t: Terminal<T>[]) => S) {
     const changes: any[] = [];
-    // TODO
-    // const scanProb:number = !scanProbability ? NaN : scanProbability(tokenPosition);
-    const scanProb: S = sr.multiplicativeIdentity;
+
+    const scanProb: S = !!scanProbability ? scanProbability(word, types) : undefined;
 
     /*
      * Get all states that are active on a terminal

diff --git a/test/earley/parser.spec.ts b/test/earley/parser.spec.ts
@@ -1,35 +1,31 @@
-import {NonTerminal, Terminal, Category} from "../../src/grammar/category";
+import {NonTerminal, Terminal} from "../../src/grammar/category";
 import {getViterbiParse, ParseTreeWithScore, Grammar} from "../../src/index";
 
-import * as Mocha from 'mocha'
-import {expect} from 'chai';
-import {scan} from "../../src/earley/scan";
-import {LogSemiring} from "semiring";
-import {Chart} from "../../src/earley/chart/chart";
+import {expect} from "chai";
 import {g, A} from "../sample-grammar";
 import {parseSentenceIntoChart} from "../../src/earley/parser";
 
-//TODO
-describe('parser', () => {
+// TODO
+describe("parser", () => {
 
 
-    it('should complete correctly', () => {
+    it("should complete correctly", () => {
         // complete(
         //     0,
         //     "e",
         //     LogSemiring,
         //     ss
         // )
     });
-    it('should predict correctly', () => {
+    it("should predict correctly", () => {
         // complete(
         //     0,
         //     "e",
         //     LogSemiring,
         //     ss
         // )
     });
-    it('should parse the man chase the man with a stick', () => {
+    it("should parse the man chase the man with a stick", () => {
         const S: NonTerminal = "S";
         const NP: NonTerminal = "NP";
         const VP: NonTerminal = "VP";
@@ -47,8 +43,8 @@ describe('parser', () => {
         const stick: Terminal<string> = (token) => !!token.match(/stick/);
         const with_: Terminal<string> = (token) => !!token.match(/with/);
 
-        const grammar: Grammar<string,number> = Grammar.builder("test")
-        //.setSemiring(new LogSemiring()) // If not set, defaults to Log semiring which is probably what you want
+        const grammar: Grammar<string, number> = Grammar.builder("test")
+        // .setSemiring(new LogSemiring()) // If not set, defaults to Log semiring which is probably what you want
             .addNewRule(
                 1.0,   // Probability between 0.0 and 1.0, defaults to 1.0. The builder takes care of converting it to the semiring element
                 S,     // Left hand side of the rule
@@ -88,18 +84,45 @@ describe('parser', () => {
             grammar,
             tokens
         );
-        //console.log(JSON.stringify(viterbi.parseTree)); // {"category":"<start>","children":[{"category":"S","children":[{"category":"NP","children":[{"category":"Det","children":[{"token":"The","children":[    ]}]},{"category":"N","children":[{"token":"man","children":[]}]}]},{"category":"VP","children":[{"category":"TV","children":[{"token":"chased","children":[]}]},{"category":"NP","children":[{"category":"Det","children":[{"token":"the","children":[]}]},{"category":"N","children":[{"token":"man","c        hildren":[]}]},{"category":"Mod","children":[{"token":"with","children":[]},{"category":"NP","children":[{"category":"Det","children":[{"token":"a",        "children":[]}]},{"category":"N","children":[{"token":"stick","children":[]}]}]}]}]}]}]}]}
-        //console.log(viterbi.probability); // 0.6
-        //Parser.recognize(S, grammar, Tokens.tokenize("the", "stick", "chased", "the", "man"))
+        // console.log(JSON.stringify(viterbi.parseTree)); // {"category":"<start>","children":[{"category":"S","children":[{"category":"NP","children":[{"category":"Det","children":[{"token":"The","children":[    ]}]},{"category":"N","children":[{"token":"man","children":[]}]}]},{"category":"VP","children":[{"category":"TV","children":[{"token":"chased","children":[]}]},{"category":"NP","children":[{"category":"Det","children":[{"token":"the","children":[]}]},{"category":"N","children":[{"token":"man","c        hildren":[]}]},{"category":"Mod","children":[{"token":"with","children":[]},{"category":"NP","children":[{"category":"Det","children":[{"token":"a",        "children":[]}]},{"category":"N","children":[{"token":"stick","children":[]}]}]}]}]}]}]}]}
+        // console.log(viterbi.probability); // 0.6
+        // Parser.recognize(S, grammar, Tokens.tokenize("the", "stick", "chased", "the", "man"))
     });
 
 
-it('should parse aaaaa', () => {
-        const tokens = ["a", "a", "a", "e"];
-        const [chart, i, init] = parseSentenceIntoChart(
+    const tokens = ["a", "a", "a", "e"];
+    it("should deal with scan probability correctly", () => {
+        const p1 = getViterbiParse(
             A,
             g,
-            tokens
+            tokens,
+            (ignore, ignored) => {
+                return g.probabilityMapping.fromProbability(1.0);
+            }
+        ).probability;
+
+        const p2 = getViterbiParse(
+            A,
+            g,
+            tokens,
+            (word, ignored) => {
+                return word === "a" ? g.probabilityMapping.fromProbability(0.5) : undefined;
+            }
+        ).probability;
+
+        const eq = p2 * 2 * 2 * 2;
+        const epsilon = 0.0000000000000001;
+        expect(p1).to.be.above(eq - epsilon).and.below(eq + epsilon);
+    });
+
+    it("should parse aaae", () => {
+        const [chart, ignored, init] = parseSentenceIntoChart(
+            A,
+            g,
+            tokens,
+            (word, terminalTypes) => {
+                return g.probabilityMapping.fromProbability(1.0);
+            }
         );
 
         expect(chart.getCompletedStates(tokens.length).has(
@@ -108,9 +131,5 @@ it('should parse aaaaa', () => {
             )
         )).to.equal(true);
 
-        /*console.log(g.probabilityMapping.toProbability(
-            chart.viterbiScores.get(chart.getOrCreate(
-                tokens.length, 0, init.rule.right.length, init.rule
-            )).innerScore));*/
     });
 });
diff --git a/test/sample-grammar.ts b/test/sample-grammar.ts
@@ -20,7 +20,7 @@ export const g:Grammar<string, number>  = builder
     .addNewRule(0.5, C, [D])
     .addNewRule(0.5, D, [E])
     .addNewRule(0.5, D, [a])
-    .addNewRule(0.5, E, [E,E])
+    .addNewRule(0.5, E, [E, E])
     .addNewRule(0.5, E, [e])
     //.addRule(0.1, E, [C])
     .build();