001    // This file is part of the Attempto Java Packages.
002    // Copyright 2008, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch).
003    //
004    // The Attempto Java Packages is free software: you can redistribute it and/or modify it under the
005    // terms of the GNU Lesser General Public License as published by the Free Software Foundation,
006    // either version 3 of the License, or (at your option) any later version.
007    //
008    // The Attempto Java Packages is distributed in the hope that it will be useful, but WITHOUT ANY
009    // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
010    // PURPOSE. See the GNU Lesser General Public License for more details.
011    //
012    // You should have received a copy of the GNU Lesser General Public License along with the Attempto
013    // Java Packages. If not, see http://www.gnu.org/licenses/.
014    
015    package ch.uzh.ifi.attempto.chartparser;
016    
017    import java.util.ArrayList;
018    import java.util.HashMap;
019    
020    /**
021     * This class represents a grammar that is needed to run the chart parser. A grammar can be created
022     * either directly in Java or on the basis of a file in the ACGN format.
023     *
024     * <h4>ACGN Format</h4>
025     * 
026     * ACGN stands for "Attempto Chartparser Grammar Notation" and uses Prolog notation to provide a nice
027     * grammar representation. Simple grammar rules in ACGN look almost the same as common Prolog DCG rules.
028     * Just replace the operator
029     * "<code>--></code>" by "<code>=></code>":
030     *<blockquote><pre>
031     * vp => v, np.
032     * v => [does, not], verb.
033     *</pre></blockquote>
034     * Complex grammar rules in ACGN are different from common Prolog DCG rules in the sense that they are using
035     * features rather than arguments with fixed positions. Arguments are not recognized by their position but by their
036     * name:
037     *<blockquote><pre>
038     * vp(num:Num,neg:Neg) => v(num:Num,neg:Neg,type:tr), np(case:acc).
039     * v(neg:plus,type:Type) => [does, not], verb(type:Type).
040     *</pre></blockquote>
041     * Every feature has the form <code>Name:Value</code> where <code>Name</code> has to be an atom and <code>Value</code>
042     * can be a variable or an atom (but not a compound term).
043     *<p>
044     * ACGN provides special support for anaphoric references which are used in (controlled) natural languages to refer
045     * to objects earlier in the sentence. For example, in the sentence
046     *<blockquote><i>
047     * A country contains an area that is not controlled by the country.
048     *</i></blockquote>
049     * the anaphoric reference "the country" refers to the antecedent "a country". Anaphoric references should be
050     * introduced only if the previous text contains a matching antecedent that is accessible. For example, in the case
051     * of the partial sentence
052     *<blockquote><i>
053     * A country does not contain a river and borders ...
054     *</i></blockquote>
055     * one can refer to "a country", but not to "a river" because being in the scope of a negation makes it inaccessible.
056     *<p>
057     * In order to define the accessibility information needed for anaphoric references in a declarative way, we
058     * distinguish two types of grammar rules: accessible rules "<code>=></code>" and inaccessible rules "<code>~></code>".
059     * The following example shows an inaccessible rule:
060     *<blockquote><pre>
061     * vp(num:Num,neg:plus) ~> v(num:Num,neg:plus,type:tr), np(case:acc).
062     *</pre></blockquote>
063     * Inaccessible rules are handled in the same way as accessible rules with the only exception that the components
064     * that are in the scope of the rule are not accessible for subsequent anaphoric references.
065     *<p>
066     * This can be visualized by the introduction of a special node "~" in the syntax tree whenever an
067     * inaccessible rule is used. For the partial sentence introduced before, the syntax tree could look as follows:
068     *<p><center>
069     * <img src="doc-files/tree.jpg" width="350" alt="example syntax tree">
070     *</center><p>
071     * In this case, several accessible rules and exactly one inaccessible rule (indicated by the "~"-node)
072     * have been used. All preceding components that can be reached through the syntax tree without traversing a
073     * "~"-node in the top-down direction are accessible. Thus, "a country" is accessible from the position
074     * "*", but "a river'' is not. Furthermore, "a country" would be accessible from the position of "a river" because
075     * the "~"-node is in this case traversed only in the bottum-up direction.
076     *<p>
077     * The described procedure allows us to determine all possible anaphoric references that can be used to continue a
078     * partial sentence. In our example, one can refer only to "a country".
079     * The concept of accessible and inaccessible rules is a simple but powerful instrument to define in a declarative
080     * way the accessibility constraints for anaphoric references.
081     *<p>
082     * The information about which tokens are accessible for anaphoric
083     * references can be retrieved by the method {@link ChartParser#getAccessiblePositions(String) getAccessiblePositions}
084     * of the {@link ChartParser} class.
085     *
086     * <h4>Transformations</h4>
087     * 
088     * ACGN grammars can be translated automatically into a Java class or into a Prolog DCG using the SWI Prolog programs
089     * "generate_java.pl" or "generate_dcg.pl", respectively. Those programs can be found in the directory
090     * "src/ch/uzh/ifi/attempto/utils" of the source code of this package. The Java class can be generated like this:
091     *<blockquote><pre>
092     * swipl -s generate_java.pl -g "generate_java('my_acgn_grammar.pl', 'my.package', 'MyJavaGrammar', 'my_start_category')" -t halt
093     *</pre></blockquote>
094     * Note that the SWI Prolog command might be different on your machine (e.g. "<code>plcon</code>" or "<code>pl</code>").
095     * The Prolog DCG file can be generated like this:
096     *<blockquote><pre>
097     * swipl -s generate_dcg.pl -g "generate_dcg('my_acgn_grammar.pl', 'my_dcg_grammar.pl')" -t halt
098     *</pre></blockquote>
099     * Note that the information about accessible and inaccessible rules gets lost in the Prolog DCG file.
100     * 
101     * @author Tobias Kuhn
102     */
103    public class Grammar {
104            
105            private final Nonterminal startCategory;
106            private ArrayList<Rule> rules = new ArrayList<Rule>();
107            private HashMap<String, ArrayList<Rule>> rulesByHeadName = new HashMap<String, ArrayList<Rule>>();
108            private ArrayList<Rule> epsilonRules = new ArrayList<Rule>();
109            
110            /**
111             * Creates a empty grammar with the given start category.
112             * 
113             * @param startCategory The start category for the grammar.
114             */
115            public Grammar(Nonterminal startCategory) {
116                    this.startCategory = startCategory;
117            }
118            
119            /**
120             * Creates a empty grammar with a start category of the given name.
121             * 
122             * @param startCategoryName The name of the start category for the grammar.
123             */
124            public Grammar(String startCategoryName) {
125                    this.startCategory = new Nonterminal(startCategoryName);
126            }
127            
128            /**
129             * Returns the start category.
130             * 
131             * @return The start category.
132             */
133            public Nonterminal getStartCategory() {
134                    return startCategory;
135            }
136            
137            /**
138             * Adds the rule to the grammar.
139             * 
140             * @param rule The rule to be added.
141             */
142            public void addRule(Rule rule) {
143                    rules.add(rule);
144                    ArrayList<Rule> l = rulesByHeadName.get(rule.getHead().getName());
145                    if (l == null) {
146                            l = new ArrayList<Rule>();
147                            l.add(rule);
148                            rulesByHeadName.put(rule.getHead().getName(), l);
149                    } else {
150                            l.add(rule);
151                    }
152                    rulesByHeadName.get(rule.getHead().getName()).add(rule);
153                    if (rule.hasEmptyBody()) {
154                            epsilonRules.add(rule);
155                    }
156            }
157            
158            /**
159             * Returns the rules whose head category has the given name.
160             * 
161             * @param name The name of the head category.
162             * @return A list of rules.
163             */
164            public ArrayList<Rule> getRulesByHeadName(String name) {
165                    ArrayList<Rule> l = rulesByHeadName.get(name);
166                    if (l != null) {
167                            return l;
168                    }
169                    return new ArrayList<Rule>();
170            }
171            
172            /**
173             * Returns all the rules that have no body categories.
174             * 
175             * @return A list of rules.
176             */
177            public ArrayList<Rule> getEpsilonRules() {
178                    return epsilonRules;
179            }
180            
181            public String toString() {
182                    String s = "";
183                    for (Rule r : rules) {
184                            s += r + "\n";
185                    }
186                    return s;
187            }
188    
189    }