package morphology.parser;

import java.io.*;
import java.util.*;
import java.util.regex.*;

/**
 * An Experimental Morphological Analyzer of Latvian.
 * Limited to inflectional morphology of nouns.
 * Techniques used: two-level morphology and finite-state transducers.
 *
 * Project: NGSLT --> NLP --> Words --> Assignment #1
 * @author Normunds Grūzītis, Gunta Nešpore, Baiba Saulīte
 * @version February-March 2006
 */
public class Analyzer {

    /** IF-THEN transducer rules (elements are {@link RegExRule}) for both processing stages. */
    private ArrayList patterns;

    /** Lexicon: stem string mapped to an ArrayList of {@link NounStemFeature} entries. */
    private SortedMap stems;

    /** Rule order marking the morphophonological (alternation normalizing) stage. */
    public static final int MORPHOPHONOLOGY = 1;

    /** Rule order marking the morphotactic (stem + ending) stage. */
    public static final int MORPHOTACTICS = 2;

    /**
     * Flags used for every rule pattern. CASE_INSENSITIVE alone folds case only for
     * US-ASCII letters; UNICODE_CASE is required so that Latvian letters with
     * diacritics are matched case-insensitively as well.
     */
    private static final int RULE_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /**
     * Constructor.
     * @param properties configuration file describing location of a database;
     *                   its "rules" and "noun_stems" properties are used as file paths
     * @throws IOException declared by this constructor; presumably raised by the
     *                     Utility loaders when a file cannot be read
     */
    public Analyzer(File properties) throws IOException {
        Properties config = Utility.readConfiguration(properties);

        //Read list of transducer rules.
        patterns = Utility.readRegExRules(new File(config.getProperty("rules")));

        //Read list of stems and create search tree.
        stems = Utility.readNounStems(new File(config.getProperty("noun_stems")));
    }

    /**
     * Transforms given word form from surface level into lexical level representation.
     * @param word word form to parse.
     * @return list of possible morphological representations of the given form,
     *         or null if no stem/rule combination accepts it.
     */
    public ArrayList parse(String word) {
        ArrayList forms = collectForms(word);
        ArrayList output = null;

        for (int n = 0; n < forms.size(); n++) {
            //Index 0 is the word exactly as given; every later entry is a normalized variant.
            output = analyzeForm((String) forms.get(n), n > 0, output);
        }
        return output;
    }

    /**
     * Builds the list of surface forms to analyze: the given word followed by one
     * normalized variant for each morphophonological rule that accepts the word.
     */
    private ArrayList collectForms(String word) {
        ArrayList forms = new ArrayList();
        forms.add(word);

        for (Iterator itRules = patterns.iterator(); itRules.hasNext();) {
            RegExRule rule = (RegExRule) itRules.next();
            if (rule.getOrder() != MORPHOPHONOLOGY) {
                continue;
            }
            Matcher accepted = Pattern.compile(rule.getIF(), RULE_FLAGS).matcher(word);
            //If word form is accepted having potential alternation,
            //transduce input into intermediate (normalized) representation.
            if (accepted.matches()) {
                forms.add(accepted.replaceFirst(rule.getTHEN()));
            }
        }
        return forms;
    }

    /**
     * Tries every prefix of the form as a stem and collects the analyses of all
     * stem + ending splits accepted by the morphotactic rules.
     * @param form       surface form (original or normalized)
     * @param normalized true if the form is a product of a morphophonological rule
     * @param output     analyses collected so far, may be null
     * @return the (possibly newly created) list of analyses, or null if still empty
     */
    private ArrayList analyzeForm(String form, boolean normalized, ArrayList output) {
        //Grow the candidate stem char by char; do not stop at the first hit,
        //since a longer stem may be present in the lexicon as well.
        for (int end = 1; end <= form.length(); end++) {
            String stem = form.substring(0, end);
            if (!stems.containsKey(stem)) {
                continue;
            }
            //Remaining part of the word form has to be legal ending for the paradigm,
            //which is associated with the stem in the lexicon.
            //It is an empty string when the stem covers the whole form.
            String ending = form.substring(end);

            //Read morphological features assigned to the stem.
            ArrayList features = (ArrayList) stems.get(stem);
            for (Iterator itFeatures = features.iterator(); itFeatures.hasNext();) {
                NounStemFeature nsf = (NounStemFeature) itFeatures.next();
                String decl = nsf.getParadigm();

                //Hard-coded knowledge about which declensions allows morphophonological alternations.
                //Temporary implementation to resolve issues like:
                //1. {kārt,D5}&{kārt,D7};
                //2. kāršu --normalization--> kārtu;
                //3. D5 F SG ACC & D5 F PL GEN <-- FALSE!
                if (normalized && !allowsAlternation(decl)) {
                    continue;
                }
                output = applyMorphotactics(stem, ending, decl, output);
            }
        }
        return output;
    }

    /** Tells whether the declension is one of those hard-coded as allowing alternations. */
    private static boolean allowsAlternation(String decl) {
        return decl.equals("D3") || decl.equals("D6") || decl.equals("D7");
    }

    /**
     * Applies every morphotactic rule to stem + ending and appends the lexical
     * representation produced by each rule that accepts the input.
     */
    private ArrayList applyMorphotactics(String stem, String ending, String decl, ArrayList output) {
        String surface = stem + ending;
        //The stem is used as a replacement string below, where '$' and '\' are special;
        //quote it so that it is always inserted literally.
        String stemReplacement = Matcher.quoteReplacement(stem);

        for (Iterator itRules = patterns.iterator(); itRules.hasNext();) {
            RegExRule rule = (RegExRule) itRules.next();
            if (rule.getOrder() != MORPHOTACTICS) {
                continue;
            }
            //Replace all according stem references with the actual stem.
            String ruleIf = rule.getIF().replaceAll(decl, stemReplacement);
            Matcher accepted = Pattern.compile(ruleIf, RULE_FLAGS).matcher(surface);

            //If content of surface type is accepted, transduce input into lexical representation.
            if (accepted.matches()) {
                if (output == null) {
                    output = new ArrayList();
                }
                output.add(accepted.replaceFirst(rule.getTHEN()));
            }
        }
        return output;
    }
}