001 // This file is part of the Attempto Java Packages. 002 // Copyright 2008, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch). 003 // 004 // The Attempto Java Packages is free software: you can redistribute it and/or modify it under the 005 // terms of the GNU Lesser General Public License as published by the Free Software Foundation, 006 // either version 3 of the License, or (at your option) any later version. 007 // 008 // The Attempto Java Packages is distributed in the hope that it will be useful, but WITHOUT ANY 009 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 010 // PURPOSE. See the GNU Lesser General Public License for more details. 011 // 012 // You should have received a copy of the GNU Lesser General Public License along with the Attempto 013 // Java Packages. If not, see http://www.gnu.org/licenses/. 014 015 package ch.uzh.ifi.attempto.chartparser; 016 017 import java.util.ArrayList; 018 import java.util.HashMap; 019 020 /** 021 * This class represents a grammar that is needed to run the chart parser. A grammar can be created 022 * either directly in Java or on the basis of a file in the ACGN format. 023 * 024 * <h4>ACGN Format</h4> 025 * 026 * ACGN stands for "Attempto Chartparser Grammar Notation" and uses Prolog notation to provide a nice 027 * grammar representation. Simple grammar rules in ACGN look almost the same as common Prolog DCG rules. 028 * Just replace the operator 029 * "<code>--></code>" by "<code>=></code>": 030 *<blockquote><pre> 031 * vp => v, np. 032 * v => [does, not], verb. 033 *</pre></blockquote> 034 * Complex grammar rules in ACGN are different from common Prolog DCG rules in the sense that they are using 035 * features rather than arguments with fixed positions. Arguments are not recognized by their position but by their 036 * name: 037 *<blockquote><pre> 038 * vp(num:Num,neg:Neg) => v(num:Num,neg:Neg,type:tr), np(case:acc). 039 * v(neg:plus,type:Type) => [does, not], verb(type:Type). 040 *</pre></blockquote> 041 * Every feature has the form <code>Name:Value</code> where <code>Name</code> has to be an atom and <code>Value</code> 042 * can be a variable or an atom (but not a compound term). 043 *<p> 044 * ACGN provides special support for anaphoric references which are used in (controlled) natural languages to refer 045 * to objects earlier in the sentence. For example, in the sentence 046 *<blockquote><i> 047 * A country contains an area that is not controlled by the country. 048 *</i></blockquote> 049 * the anaphoric reference "the country" refers to the antecedent "a country". Anaphoric references should be 050 * introduced only if the previous text contains a matching antecedent that is accessible. For example, in the case 051 * of the partial sentence 052 *<blockquote><i> 053 * A country does not contain a river and borders ... 054 *</i></blockquote> 055 * one can refer to "a country", but not to "a river" because being in the scope of a negation makes it inaccessible. 056 *<p> 057 * In order to define the accessibility information needed for anaphoric references in a declarative way, we 058 * distinguish two types of grammar rules: accessible rules "<code>=></code>" and inaccessible rules "<code>~></code>". 059 * The following example shows an inaccessible rule: 060 *<blockquote><pre> 061 * vp(num:Num,neg:plus) ~> v(num:Num,neg:plus,type:tr), np(case:acc). 062 *</pre></blockquote> 063 * Inaccessible rules are handled in the same way as accessible rules with the only exception that the components 064 * that are in the scope of the rule are not accessible for subsequent anaphoric references. 065 *<p> 066 * This can be visualized by the introduction of a special node "~" in the syntax tree whenever an 067 * inaccessible rule is used. For the partial sentence introduced before, the syntax tree could look as follows: 068 *<p><center> 069 * <img src="doc-files/tree.jpg" width="350" alt="example syntax tree"> 070 *</center><p> 071 * In this case, several accessible rules and exactly one inaccessible rule (indicated by the "~"-node) 072 * have been used. All preceding components that can be reached through the syntax tree without traversing a 073 * "~"-node in the top-down direction are accessible. Thus, "a country" is accessible from the position 074 * "*", but "a river'' is not. Furthermore, "a country" would be accessible from the position of "a river" because 075 * the "~"-node is in this case traversed only in the bottum-up direction. 076 *<p> 077 * The described procedure allows us to determine all possible anaphoric references that can be used to continue a 078 * partial sentence. In our example, one can refer only to "a country". 079 * The concept of accessible and inaccessible rules is a simple but powerful instrument to define in a declarative 080 * way the accessibility constraints for anaphoric references. 081 *<p> 082 * The information about which tokens are accessible for anaphoric 083 * references can be retrieved by the method {@link ChartParser#getAccessiblePositions(String) getAccessiblePositions} 084 * of the {@link ChartParser} class. 085 * 086 * <h4>Transformations</h4> 087 * 088 * ACGN grammars can be translated automatically into a Java class or into a Prolog DCG using the SWI Prolog programs 089 * "generate_java.pl" or "generate_dcg.pl", respectively. Those programs can be found in the directory 090 * "src/ch/uzh/ifi/attempto/utils" of the source code of this package. The Java class can be generated like this: 091 *<blockquote><pre> 092 * swipl -s generate_java.pl -g "generate_java('my_acgn_grammar.pl', 'my.package', 'MyJavaGrammar', 'my_start_category')" -t halt 093 *</pre></blockquote> 094 * Note that the SWI Prolog command might be different on your machine (e.g. "<code>plcon</code>" or "<code>pl</code>"). 095 * The Prolog DCG file can be generated like this: 096 *<blockquote><pre> 097 * swipl -s generate_dcg.pl -g "generate_dcg('my_acgn_grammar.pl', 'my_dcg_grammar.pl')" -t halt 098 *</pre></blockquote> 099 * Note that the information about accessible and inaccessible rules gets lost in the Prolog DCG file. 100 * 101 * @author Tobias Kuhn 102 */ 103 public class Grammar { 104 105 private final Nonterminal startCategory; 106 private ArrayList<Rule> rules = new ArrayList<Rule>(); 107 private HashMap<String, ArrayList<Rule>> rulesByHeadName = new HashMap<String, ArrayList<Rule>>(); 108 private ArrayList<Rule> epsilonRules = new ArrayList<Rule>(); 109 110 /** 111 * Creates a empty grammar with the given start category. 112 * 113 * @param startCategory The start category for the grammar. 114 */ 115 public Grammar(Nonterminal startCategory) { 116 this.startCategory = startCategory; 117 } 118 119 /** 120 * Creates a empty grammar with a start category of the given name. 121 * 122 * @param startCategoryName The name of the start category for the grammar. 123 */ 124 public Grammar(String startCategoryName) { 125 this.startCategory = new Nonterminal(startCategoryName); 126 } 127 128 /** 129 * Returns the start category. 130 * 131 * @return The start category. 132 */ 133 public Nonterminal getStartCategory() { 134 return startCategory; 135 } 136 137 /** 138 * Adds the rule to the grammar. 139 * 140 * @param rule The rule to be added. 141 */ 142 public void addRule(Rule rule) { 143 rules.add(rule); 144 ArrayList<Rule> l = rulesByHeadName.get(rule.getHead().getName()); 145 if (l == null) { 146 l = new ArrayList<Rule>(); 147 l.add(rule); 148 rulesByHeadName.put(rule.getHead().getName(), l); 149 } else { 150 l.add(rule); 151 } 152 rulesByHeadName.get(rule.getHead().getName()).add(rule); 153 if (rule.hasEmptyBody()) { 154 epsilonRules.add(rule); 155 } 156 } 157 158 /** 159 * Returns the rules whose head category has the given name. 160 * 161 * @param name The name of the head category. 162 * @return A list of rules. 163 */ 164 public ArrayList<Rule> getRulesByHeadName(String name) { 165 ArrayList<Rule> l = rulesByHeadName.get(name); 166 if (l != null) { 167 return l; 168 } 169 return new ArrayList<Rule>(); 170 } 171 172 /** 173 * Returns all the rules that have no body categories. 174 * 175 * @return A list of rules. 176 */ 177 public ArrayList<Rule> getEpsilonRules() { 178 return epsilonRules; 179 } 180 181 public String toString() { 182 String s = ""; 183 for (Rule r : rules) { 184 s += r + "\n"; 185 } 186 return s; 187 } 188 189 }