001    // This file is part of the Attempto Java Packages.
002    // Copyright 2008-2009, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch).
003    //
004    // The Attempto Java Packages is free software: you can redistribute it and/or modify it under the
005    // terms of the GNU Lesser General Public License as published by the Free Software Foundation,
006    // either version 3 of the License, or (at your option) any later version.
007    //
008    // The Attempto Java Packages is distributed in the hope that it will be useful, but WITHOUT ANY
009    // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
010    // PURPOSE. See the GNU Lesser General Public License for more details.
011    //
012    // You should have received a copy of the GNU Lesser General Public License along with the Attempto
013    // Java Packages. If not, see http://www.gnu.org/licenses/.
014    
015    package ch.uzh.ifi.attempto.chartparser;
016    
017    import java.util.ArrayList;
018    import java.util.HashMap;
019    
020    /**
021     * This class represents a grammar that is needed to run the chart parser. A grammar can be created
022     * either directly in Java or on the basis of a file in the Codeco format.
023     *
024     * <h4>Codeco Format</h4>
025     * 
026     * Codeco stands for "COncrete and DEclarative grammar notation for COntrolled natural languages" and uses Prolog
027     * notation to provide a nice grammar representation. Simple grammar rules in Codeco look almost the same as common
028     * Prolog DCG rules.
029     * Just replace the operator
030     * "<code>--></code>" by "<code>=></code>":
031     *<blockquote><pre>
032     * vp => v, np.
033     * v => [does, not], verb.
034     *</pre></blockquote>
035     * Note that terminal categories are in square brackets.
036     *<p>
037     * Complex grammar rules in Codeco are different from common Prolog DCG rules in the sense that they are using
038     * features rather than arguments with fixed positions. Arguments are not recognized by their position but by their
039     * name:
040     *<blockquote><pre>
041     * vp(num:Num,neg:Neg) => v(num:Num,neg:Neg,type:tr), np(case:acc).
042     * v(neg:plus,type:Type) => [does, not], verb(type:Type).
043     *</pre></blockquote>
044     * Every feature has the form <code>Name:Value</code> where <code>Name</code> has to be an atom and <code>Value</code>
045     * can be a variable or an atom (but not a compound term). Also terminal categories (in square brackets) can have features
046     * and they can be used as pre-terminals that are instantiated with the concrete word forms outside of the parser:
047     *<blockquote><pre>
048     * np => [noun(text:Noun)].
049     *</pre></blockquote>
050     *<p>
051     * Codeco provides special support for anaphoric references. Anaphoric references are used in (controlled) natural
052     * languages to refer to objects earlier in the sentence. For example, in the sentence
053     *<blockquote><i>
054     * A country contains an area that is not controlled by the country.
055     *</i></blockquote>
056     * the anaphoric reference "the country" refers to the antecedent "a country". In Codeco, anaphoric references are
057     * defined by the special categories "<code>>>></code>" and "<code><<<</code>". "<code>>>></code>" marks a
058     * position in the text to which anaphoric references can refer to (such positions are called "antecedents").
059     * "<code><<<</code>" refers back to the closed possible antecedent. An example is shown here:
060     *<blockquote><pre>
061     * np => [a, noun(text:Noun)], >>>(type:noun, noun:Noun).
062     * ref => [the, noun(text:Noun)], <<<(type:noun, noun:Noun).
063     *</pre></blockquote>
064     * Furthermore, the special category "<code>&lt;/&lt;</code>" can be used to ensure that there is no matching antecedent.
065     * This can be used for example for variables to ensure that no variable is defined twice:
066     *<blockquote><pre>
067     * np => [var(text:Var)], &lt;/&lt;(type:var, var:Var), >>>(type:var, var:Var).
068     * ref => [var(text:Var)], <<<(type:var, var:Var).
069     *</pre></blockquote>
070     * In order to be predicted correctly by the chartparser, the back-referring categories "<code><<<</code>" and
071     * "<code>&lt;/&lt;</code>" should be immediately preceded by a terminal category.
072     *<p>
073     * Anaphoric references are only allowed if the previous text contains a matching antecedent that is accessible.
074     * For example, in the case of the partial sentence
075     *<blockquote><i>
076     * A country does not contain a river and borders ...
077     *</i></blockquote>
078     * one can refer to "a country", but not to "a river" because being in the scope of a negation makes it inaccessible.
079     *<p>
080     * In order to define the accessibility constraints needed for anaphoric references, we
081     * distinguish two types of grammar rules: accessible rules "<code>=></code>" and inaccessible rules "<code>~></code>".
082     * The following example shows an inaccessible rule:
083     *<blockquote><pre>
084     * vp(num:Num,neg:plus) ~> v(num:Num,neg:plus,type:tr), np(case:acc).
085     *</pre></blockquote>
086     * Inaccessible rules are handled in the same way as accessible rules with the only exception that the components
087     * that are in the scope of the rule are not accessible for subsequent anaphoric references.
088     *<p>
089     * This can be visualized by the introduction of a special node "~" in the syntax tree whenever an
090     * inaccessible rule is used. For the partial sentence introduced before, the syntax tree could look as follows:
091     *<p><center>
092     * <img src="doc-files/tree.jpg" width="350" alt="example syntax tree">
093     *</center><p>
094     * In this case, several accessible rules and exactly one inaccessible rule (indicated by the "~"-node)
095     * have been used. All preceding components that can be reached through the syntax tree without traversing a
096     * "~"-node in the top-down direction are accessible. Thus, "a country" is accessible from the position
097     * "*", but "a river'' is not. Furthermore, "a country" would be accessible from the position of "a river" because
098     * the "~"-node is in this case traversed only in the bottum-up direction.
099     *
100     * <h4>Transformations</h4>
101     * 
102     * Codeco grammars can be translated automatically into a Java class or into a Prolog DCG using the SWI Prolog programs
103     * "generate_java.pl" or "generate_dcg.pl", respectively. Those programs can be found in the directory
104     * "src/ch/uzh/ifi/attempto/chartparser/util" of the source code of this package. The Java class can be generated like this:
105     *<blockquote><pre>
106     * swipl -s generate_java.pl -g "generate_java('my_codeco_grammar.pl', 'my.package', 'MyJavaGrammar', 'my_start_category')" -t halt
107     *</pre></blockquote>
108     * Note that the SWI Prolog command might be different on your machine (e.g. "<code>plcon</code>" or "<code>pl</code>").
109     * The Prolog DCG file can be generated like this:
110     *<blockquote><pre>
111     * swipl -s generate_dcg.pl -g "generate_dcg('my_codeco_grammar.pl', 'my_dcg_grammar.pl')" -t halt
112     *</pre></blockquote>
113     * 
114     * @author Tobias Kuhn
115     */
116    public class Grammar {
117            
118            private final Nonterminal startCategory;
119            private ArrayList<Rule> rules = new ArrayList<Rule>();
120            private HashMap<String, ArrayList<Rule>> rulesByHeadName = new HashMap<String, ArrayList<Rule>>();
121            private ArrayList<Rule> epsilonRules = new ArrayList<Rule>();
122            
123            /**
124             * Creates a empty grammar with the given start category.
125             * 
126             * @param startCategory The start category for the grammar.
127             */
128            public Grammar(Nonterminal startCategory) {
129                    this.startCategory = startCategory;
130            }
131            
132            /**
133             * Creates a empty grammar with a start category of the given name.
134             * 
135             * @param startCategoryName The name of the start category for the grammar.
136             */
137            public Grammar(String startCategoryName) {
138                    this.startCategory = new Nonterminal(startCategoryName);
139            }
140            
141            /**
142             * Returns the start category.
143             * 
144             * @return The start category.
145             */
146            public Nonterminal getStartCategory() {
147                    return startCategory;
148            }
149            
150            /**
151             * Adds the rule to the grammar.
152             * 
153             * @param rule The rule to be added.
154             */
155            public void addRule(Rule rule) {
156                    rules.add(rule);
157                    ArrayList<Rule> l = rulesByHeadName.get(rule.getHead().getName());
158                    if (l == null) {
159                            l = new ArrayList<Rule>();
160                            l.add(rule);
161                            rulesByHeadName.put(rule.getHead().getName(), l);
162                    } else {
163                            l.add(rule);
164                    }
165                    rulesByHeadName.get(rule.getHead().getName()).add(rule);
166                    if (rule.hasEmptyBody()) {
167                            epsilonRules.add(rule);
168                    }
169            }
170            
171            /**
172             * Returns the rules whose head category has the given name.
173             * 
174             * @param name The name of the head category.
175             * @return A list of rules.
176             */
177            public ArrayList<Rule> getRulesByHeadName(String name) {
178                    ArrayList<Rule> l = rulesByHeadName.get(name);
179                    if (l != null) {
180                            return l;
181                    }
182                    return new ArrayList<Rule>();
183            }
184            
185            /**
186             * Returns all the rules that have no body categories.
187             * 
188             * @return A list of rules.
189             */
190            public ArrayList<Rule> getEpsilonRules() {
191                    return epsilonRules;
192            }
193            
194            public String toString() {
195                    String s = "";
196                    for (Rule r : rules) {
197                            s += r + "\n";
198                    }
199                    return s;
200            }
201    
202    }