001 // This file is part of the Attempto Java Packages. 002 // Copyright 2008-2009, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch). 003 // 004 // The Attempto Java Packages is free software: you can redistribute it and/or modify it under the 005 // terms of the GNU Lesser General Public License as published by the Free Software Foundation, 006 // either version 3 of the License, or (at your option) any later version. 007 // 008 // The Attempto Java Packages is distributed in the hope that it will be useful, but WITHOUT ANY 009 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 010 // PURPOSE. See the GNU Lesser General Public License for more details. 011 // 012 // You should have received a copy of the GNU Lesser General Public License along with the Attempto 013 // Java Packages. If not, see http://www.gnu.org/licenses/. 014 015 package ch.uzh.ifi.attempto.chartparser; 016 017 import java.util.ArrayList; 018 import java.util.HashMap; 019 import java.util.List; 020 021 /** 022 * This is a chart parser (concretely an Earley parser) fully implemented in Java. However, there is a 023 * Prolog format ("Attempto Chartparser Grammar Notation" or "ACGN") that can be transformed into Java 024 * (at compile time). 025 * 026 * @author Tobias Kuhn 027 * @see Grammar 028 */ 029 public class ChartParser { 030 031 private final Grammar grammar; 032 private final Chart chart = new Chart(); 033 private final ArrayList<Object> tokens = new ArrayList<Object>(); 034 private boolean debug; 035 036 /** 037 * Creates a new chart parser for the given grammar. The grammar must not be changed afterwards. 038 * 039 * @param grammar The grammar to be used by the chart parser. 040 */ 041 public ChartParser(Grammar grammar) { 042 this.grammar = grammar; 043 init(grammar.getStartCategory()); 044 completeAndPredict(); 045 } 046 047 /** 048 * This method can be used to switch on/off debug mode (default is off). In debug mode, messages about the actions 049 * of the chart parser are printed onto the standard error device. 050 * 051 * @param debug true to switch on debug mode or false to switch it off. 052 */ 053 public void debug(boolean debug) { 054 this.debug = debug; 055 } 056 057 /** 058 * Adds the token to the token sequence and makes one more parsing step to process it. If several tokens 059 * are given then these tokens are treated in a parallel way representing a token that can have multiple 060 * forms. 061 * 062 * @param token The token to be added to the token sequence. 063 */ 064 public void addToken(Terminal... token) { 065 if (token.length == 1) { 066 tokens.add(token[0]); 067 } else { 068 tokens.add(token); 069 } 070 for (Terminal t : token) { 071 Edge edge = new Edge(tokens.size()-1, tokens.size(), t, true); 072 chart.addEdge(edge); 073 if (debug) System.err.println("SCANNER: " + edge); 074 } 075 completeAndPredict(); 076 //if (debug) System.err.println("CHART:"); 077 //if (debug) System.err.println(chart); 078 } 079 080 /** 081 * Adds the tokens to the token sequence and processes them. 082 * 083 * @param tokens The tokens to be added to the token sequence. 084 */ 085 public void addTokens(List<Terminal> tokens) { 086 for (Terminal t : tokens) { 087 addToken(t); 088 } 089 } 090 091 /** 092 * Removes the last token and reverts the last parsing step. 093 */ 094 public void removeToken() { 095 chart.removeEdgesWithEndPos(tokens.size()); 096 tokens.remove(tokens.size()-1); 097 } 098 099 /** 100 * Removes all tokens in the current token sequence and resets the chart. 101 */ 102 public void removeAllTokens() { 103 tokens.clear(); 104 chart.clear(); 105 init(grammar.getStartCategory()); 106 completeAndPredict(); 107 } 108 109 /** 110 * Returns the current token sequence. The elements of the list are strings (in the normal case) 111 * or arrays of strings (in the case of parallel tokens). 112 * 113 * @return The current token sequence. 114 */ 115 public List<Object> getTokens() { 116 return new ArrayList<Object>(tokens); 117 } 118 119 /** 120 * Returns all tokens that are allowed to follow the current token sequence according to the grammar. 121 * 122 * @return The possible next tokens. 123 */ 124 public List<Terminal> nextTokens() { 125 ArrayList<Terminal> terminals = new ArrayList<Terminal>(); 126 if (debug) System.err.print("LOOKING FORWARD:"); 127 for (Edge e : chart.getEdgesByEndPos(tokens.size(), true)) { 128 if (!e.isActive()) continue; 129 if (!(e.getNextActive() instanceof Terminal)) continue; 130 131 Terminal t = (Terminal) e.getNextActive(); 132 boolean isNew = true; 133 for (Terminal knownTerminal : new ArrayList<Terminal>(terminals)) { 134 if (t.subsumes(knownTerminal)) { 135 terminals.remove(knownTerminal); 136 } else if (knownTerminal.subsumes(t)) { 137 isNew = false; 138 break; 139 } 140 } 141 if (isNew) { 142 terminals.add(t); 143 } 144 } 145 ArrayList<Terminal> copiedTerminals = new ArrayList<Terminal>(); 146 for (Terminal t : terminals) { 147 copiedTerminals.add((Terminal) t.deepCopy()); 148 if (debug) System.err.print(" " + t); 149 } 150 if (debug) System.err.println(); 151 152 return copiedTerminals; 153 } 154 155 /** 156 * Returns a boolean array that describes which of the tokens of the current token sequence are 157 * accessible (true) and which are not (false) for the given next token. 158 * 159 * @param nextToken The next token for which the tokens should be checked for accessibility. 160 * @return A boolean array where each element stands for one token of the token sequence. 161 */ 162 public boolean[] getAccessiblePositions(String nextToken) { 163 boolean[] pos = new boolean[tokens.size()]; 164 ArrayList<ArrayList<Edge>> paths = new ArrayList<ArrayList<Edge>>(); 165 166 for (Edge e : chart.getEdgesByEndPosAndActCat(tokens.size(), nextToken, false)) { 167 if (!e.isActive()) continue; 168 169 ArrayList<Edge> partialPath = new ArrayList<Edge>(); 170 partialPath.add(e.deepCopy()); 171 collectActivePaths(0, partialPath, paths); 172 } 173 174 for (ArrayList<Edge> path : paths) { 175 for (Edge edge : path) { 176 scanForAccessiblePositions(edge, pos, new ArrayList<Edge>()); 177 } 178 } 179 180 return pos; 181 } 182 183 private void scanForAccessiblePositions(Edge edge, boolean[] pos, ArrayList<Edge> visitedEdges) { 184 if (edge.getStartPos() == edge.getEndPos()) return; 185 if (edge.getBody().length == 0 && edge.getHead() instanceof Terminal) { 186 pos[edge.getStartPos()] = true; 187 return; 188 } 189 190 for (Edge e : edge.getLinks()) { 191 if (!e.isAccessible()) continue; 192 if (visitedEdges.contains(e)) continue; 193 visitedEdges.add(e); 194 scanForAccessiblePositions(e, pos, visitedEdges); 195 } 196 } 197 198 private void collectActivePaths(int startPos, ArrayList<Edge> partialPath, ArrayList<ArrayList<Edge>> paths) { 199 Edge edge = partialPath.get(partialPath.size()-1); 200 if (edge.getStartPos() == startPos) { 201 paths.add(partialPath); 202 return; 203 } 204 if (edge.getStartPos() < startPos) { 205 return; 206 } 207 ArrayList<Edge> edgesToCheck = new ArrayList<Edge>(); 208 for (Edge e : chart.getEdgesByEndPosAndActCat(edge.getStartPos(), edge.getHead().getName(), false)) { 209 Category[] newBody = new Category[e.getProgress()+1]; 210 System.arraycopy(e.getBody(), 0, newBody, 0, e.getProgress()+1); 211 Edge ec = new Edge(e.getStartPos(), e.getEndPos(), e.getHead(), newBody, e.getProgress(), true); 212 ec.addLinksFrom(e); 213 boolean isNew = true; 214 for (Edge ee : edgesToCheck) { 215 if (ee.subsumes(ec)) { 216 isNew = false; 217 break; 218 } 219 } 220 if (isNew) { 221 for (Edge ee : new ArrayList<Edge>(edgesToCheck)) { 222 if (ec.subsumes(ee)) { 223 edgesToCheck.remove(ee); 224 } 225 } 226 edgesToCheck.add(ec); 227 } 228 } 229 for (Edge e : edgesToCheck) { 230 try { 231 Edge eC = e.deepCopy(); 232 ArrayList<Edge> newPartialPath = copyEdgeList(partialPath); 233 Edge newEdge = newPartialPath.get(partialPath.size()-1); 234 newEdge.getHead().unify(eC.getNextActive()); 235 newPartialPath.add(eC); 236 collectActivePaths(startPos, newPartialPath, paths); 237 } catch (UnificationFailedException ex) { 238 continue; 239 } 240 } 241 } 242 243 private ArrayList<Edge> copyEdgeList(ArrayList<Edge> edgeList) { 244 ArrayList<Edge> edgeListCopy = new ArrayList<Edge>(); 245 HashMap<Integer, StringEntity> entities = new HashMap<Integer, StringEntity>(); 246 for (Edge edge : edgeList) { 247 edgeListCopy.add(edge.deepCopy(entities)); 248 } 249 return edgeListCopy; 250 } 251 252 private void init(Nonterminal category) { 253 for (Rule rule : grammar.getRulesByHeadName(category.getName())) { 254 try { 255 Nonterminal categoryC = (Nonterminal) category.deepCopy(); 256 Rule ruleC = rule.deepCopy(); 257 categoryC.unify(ruleC.getHead()); 258 Edge edge = new Edge(0, 0, ruleC.getHead(), ruleC.getBody(), ruleC.isAccessible()); 259 chart.addEdge(edge); 260 if (debug) System.err.println("INIT: " + rule + " >>> " + edge); 261 } catch (UnificationFailedException ex) { 262 continue; 263 } 264 } 265 } 266 267 private void completeAndPredict() { 268 int c = 0; 269 do { 270 complete(); 271 c = predict(); 272 } while (c > 0); 273 } 274 275 private int predict() { 276 int count = 0; 277 for (Edge e : chart.getEdgesByEndPos(tokens.size(), true)) { 278 Category cat = e.getNextActive(); 279 if (cat == null) continue; 280 281 if (debug) System.err.println("PREDICT FOR EDGE: " + e); 282 int subcount = predict(cat); 283 count += subcount; 284 } 285 for (Rule rule : grammar.getEpsilonRules()) { 286 Edge edge = new Edge(tokens.size(), tokens.size(), rule.getHead(), rule.isAccessible()); 287 boolean isNewEdge = chart.addEdge(edge); 288 if (isNewEdge) { 289 count++; 290 if (debug) System.err.println("EDGE FOR EPSILON RULE: " + edge); 291 } 292 } 293 return count; 294 } 295 296 private int predict(Category category) { 297 int count = 0; 298 for (Rule rule : grammar.getRulesByHeadName(category.getName())) { 299 if (rule.hasEmptyBody()) continue; 300 301 try { 302 Category categoryC = category.deepCopy(); 303 Rule ruleC = rule.deepCopy(); 304 categoryC.unify(ruleC.getHead()); 305 int p = tokens.size(); 306 Edge edge = new Edge(p, p, ruleC.getHead(), ruleC.getBody(), ruleC.isAccessible()); 307 if (debug) System.err.println("PREDICTOR: " + rule + " >>> " + edge); 308 boolean isNewEdge = chart.addEdge(edge); 309 if (isNewEdge) { 310 if (debug) System.err.println("PREDICT FOR EDGE: " + edge); 311 int subcount = predict(edge.getNextActive()); 312 count += subcount + 1; 313 } 314 } catch (UnificationFailedException ex) { 315 continue; 316 } 317 } 318 return count; 319 } 320 321 private void complete() { 322 for (Edge e : chart.getEdgesByEndPosAndActCat(tokens.size(), null, true)) { 323 if (debug) System.err.println("COMPLETE FOR EDGE: " + e); 324 complete(e.getStartPos(), e); 325 } 326 } 327 328 private void complete(int pos, Edge passiveEdge) { 329 Category category = passiveEdge.getHead(); 330 331 for (Edge edge : chart.getEdgesByEndPosAndActCat(pos, category.getName(), pos == tokens.size())) { 332 if (!edge.isActive()) continue; 333 334 try { 335 Category categoryC = category.deepCopy(); 336 Edge edgeC = edge.deepCopy(); 337 categoryC.unify(edgeC.getNextActive()); 338 edgeC.step(tokens.size(), passiveEdge); 339 if (debug) System.err.println("COMPLETOR: " + edge + " >>> " + edgeC); 340 boolean isNewEdge = chart.addEdge(edgeC); 341 if (isNewEdge && !edgeC.isActive()) { 342 complete(edgeC.getStartPos(), edgeC); 343 } 344 } catch (UnificationFailedException ex) { 345 continue; 346 } 347 } 348 } 349 350 }