001 // This file is part of the Attempto Java Packages.
002 // Copyright 2008, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch).
003 //
004 // The Attempto Java Packages is free software: you can redistribute it and/or modify it under the
005 // terms of the GNU Lesser General Public License as published by the Free Software Foundation,
006 // either version 3 of the License, or (at your option) any later version.
007 //
008 // The Attempto Java Packages is distributed in the hope that it will be useful, but WITHOUT ANY
009 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
010 // PURPOSE. See the GNU Lesser General Public License for more details.
011 //
012 // You should have received a copy of the GNU Lesser General Public License along with the Attempto
013 // Java Packages. If not, see http://www.gnu.org/licenses/.
014
015 package ch.uzh.ifi.attempto.chartparser;
016
017 import java.util.ArrayList;
018 import java.util.HashMap;
019
020 /**
021 * This class represents a grammar that is needed to run the chart parser. A grammar can be created
022 * either directly in Java or on the basis of a file in the ACGN format.
023 *
024 * <h4>ACGN Format</h4>
025 *
026 * ACGN stands for "Attempto Chartparser Grammar Notation" and uses Prolog notation to provide a nice
027 * grammar representation. Simple grammar rules in ACGN look almost the same as common Prolog DCG rules.
028 * Just replace the operator
029 * "<code>--></code>" by "<code>=></code>":
030 *<blockquote><pre>
031 * vp => v, np.
032 * v => [does, not], verb.
033 *</pre></blockquote>
034 * Complex grammar rules in ACGN are different from common Prolog DCG rules in the sense that they are using
035 * features rather than arguments with fixed positions. Arguments are not recognized by their position but by their
036 * name:
037 *<blockquote><pre>
038 * vp(num:Num,neg:Neg) => v(num:Num,neg:Neg,type:tr), np(case:acc).
039 * v(neg:plus,type:Type) => [does, not], verb(type:Type).
040 *</pre></blockquote>
041 * Every feature has the form <code>Name:Value</code> where <code>Name</code> has to be an atom and <code>Value</code>
042 * can be a variable or an atom (but not a compound term).
043 *<p>
044 * ACGN provides special support for anaphoric references which are used in (controlled) natural languages to refer
045 * to objects earlier in the sentence. For example, in the sentence
046 *<blockquote><i>
047 * A country contains an area that is not controlled by the country.
048 *</i></blockquote>
049 * the anaphoric reference "the country" refers to the antecedent "a country". Anaphoric references should be
050 * introduced only if the previous text contains a matching antecedent that is accessible. For example, in the case
051 * of the partial sentence
052 *<blockquote><i>
053 * A country does not contain a river and borders ...
054 *</i></blockquote>
055 * one can refer to "a country", but not to "a river" because being in the scope of a negation makes it inaccessible.
056 *<p>
057 * In order to define the accessibility information needed for anaphoric references in a declarative way, we
058 * distinguish two types of grammar rules: accessible rules "<code>=></code>" and inaccessible rules "<code>~></code>".
059 * The following example shows an inaccessible rule:
060 *<blockquote><pre>
061 * vp(num:Num,neg:plus) ~> v(num:Num,neg:plus,type:tr), np(case:acc).
062 *</pre></blockquote>
063 * Inaccessible rules are handled in the same way as accessible rules with the only exception that the components
064 * that are in the scope of the rule are not accessible for subsequent anaphoric references.
065 *<p>
066 * This can be visualized by the introduction of a special node "~" in the syntax tree whenever an
067 * inaccessible rule is used. For the partial sentence introduced before, the syntax tree could look as follows:
068 *<p><center>
069 * <img src="doc-files/tree.jpg" width="350" alt="example syntax tree">
070 *</center><p>
071 * In this case, several accessible rules and exactly one inaccessible rule (indicated by the "~"-node)
072 * have been used. All preceding components that can be reached through the syntax tree without traversing a
073 * "~"-node in the top-down direction are accessible. Thus, "a country" is accessible from the position
074 * "*", but "a river'' is not. Furthermore, "a country" would be accessible from the position of "a river" because
075 * the "~"-node is in this case traversed only in the bottum-up direction.
076 *<p>
077 * The described procedure allows us to determine all possible anaphoric references that can be used to continue a
078 * partial sentence. In our example, one can refer only to "a country".
079 * The concept of accessible and inaccessible rules is a simple but powerful instrument to define in a declarative
080 * way the accessibility constraints for anaphoric references.
081 *<p>
082 * The information about which tokens are accessible for anaphoric
083 * references can be retrieved by the method {@link ChartParser#getAccessiblePositions(String) getAccessiblePositions}
084 * of the {@link ChartParser} class.
085 *
086 * <h4>Transformations</h4>
087 *
088 * ACGN grammars can be translated automatically into a Java class or into a Prolog DCG using the SWI Prolog programs
089 * "generate_java.pl" or "generate_dcg.pl", respectively. Those programs can be found in the directory
090 * "src/ch/uzh/ifi/attempto/utils" of the source code of this package. The Java class can be generated like this:
091 *<blockquote><pre>
092 * swipl -s generate_java.pl -g "generate_java('my_acgn_grammar.pl', 'my.package', 'MyJavaGrammar', 'my_start_category')" -t halt
093 *</pre></blockquote>
094 * Note that the SWI Prolog command might be different on your machine (e.g. "<code>plcon</code>" or "<code>pl</code>").
095 * The Prolog DCG file can be generated like this:
096 *<blockquote><pre>
097 * swipl -s generate_dcg.pl -g "generate_dcg('my_acgn_grammar.pl', 'my_dcg_grammar.pl')" -t halt
098 *</pre></blockquote>
099 * Note that the information about accessible and inaccessible rules gets lost in the Prolog DCG file.
100 *
101 * @author Tobias Kuhn
102 */
103 public class Grammar {
104
105 private final Nonterminal startCategory;
106 private ArrayList<Rule> rules = new ArrayList<Rule>();
107 private HashMap<String, ArrayList<Rule>> rulesByHeadName = new HashMap<String, ArrayList<Rule>>();
108 private ArrayList<Rule> epsilonRules = new ArrayList<Rule>();
109
110 /**
111 * Creates a empty grammar with the given start category.
112 *
113 * @param startCategory The start category for the grammar.
114 */
115 public Grammar(Nonterminal startCategory) {
116 this.startCategory = startCategory;
117 }
118
119 /**
120 * Creates a empty grammar with a start category of the given name.
121 *
122 * @param startCategoryName The name of the start category for the grammar.
123 */
124 public Grammar(String startCategoryName) {
125 this.startCategory = new Nonterminal(startCategoryName);
126 }
127
128 /**
129 * Returns the start category.
130 *
131 * @return The start category.
132 */
133 public Nonterminal getStartCategory() {
134 return startCategory;
135 }
136
137 /**
138 * Adds the rule to the grammar.
139 *
140 * @param rule The rule to be added.
141 */
142 public void addRule(Rule rule) {
143 rules.add(rule);
144 ArrayList<Rule> l = rulesByHeadName.get(rule.getHead().getName());
145 if (l == null) {
146 l = new ArrayList<Rule>();
147 l.add(rule);
148 rulesByHeadName.put(rule.getHead().getName(), l);
149 } else {
150 l.add(rule);
151 }
152 rulesByHeadName.get(rule.getHead().getName()).add(rule);
153 if (rule.hasEmptyBody()) {
154 epsilonRules.add(rule);
155 }
156 }
157
158 /**
159 * Returns the rules whose head category has the given name.
160 *
161 * @param name The name of the head category.
162 * @return A list of rules.
163 */
164 public ArrayList<Rule> getRulesByHeadName(String name) {
165 ArrayList<Rule> l = rulesByHeadName.get(name);
166 if (l != null) {
167 return l;
168 }
169 return new ArrayList<Rule>();
170 }
171
172 /**
173 * Returns all the rules that have no body categories.
174 *
175 * @return A list of rules.
176 */
177 public ArrayList<Rule> getEpsilonRules() {
178 return epsilonRules;
179 }
180
181 public String toString() {
182 String s = "";
183 for (Rule r : rules) {
184 s += r + "\n";
185 }
186 return s;
187 }
188
189 }