package morphology.parser; import java.io.*; import java.text.*; import java.util.*; /** * An Experimental Morphological Analyzer of Latvian. * Limited to inflectional morphology of nouns. * Techniques used: two-level morphology and finite-state transducers. * * Project: NGSLT --> NLP --> Words --> Assignment #1 * @author Normunds Grūzītis, Gunta Nešpore, Baiba Saulīte * @date February 2006 */ public class Analyzer { private SortedMap stems; private SortedMap endings; private SortedMap paradigms; /** * Reads configuration data from an external properties file. * @param properties pattern configuration file. * @return list of key-value pairs. */ private Properties readConfiguration(File properties) throws IOException { Properties config = new Properties(); FileInputStream fis = new FileInputStream(properties); InputStreamReader isr = new InputStreamReader(fis, "Cp1257"); BufferedReader reader = new BufferedReader(isr); String line = null; while ((line = reader.readLine()) != null) { line = line.trim(); //Valid configuration entry satisfies pattern "KEY=VALUE". if (!line.startsWith("#") && line.indexOf("=") > 0 && line.indexOf("=") < line.length() - 1) { String key = line.substring(0, line.indexOf("=")); String value = line.substring(line.indexOf("=") + 1); config.setProperty(key, value); } } reader.close(); return config; } /** * Reads table of noun stems into memory. * @param lex lexicon data file. */ private void readNounStems(File lex) throws IOException { FileInputStream fis = new FileInputStream(lex); InputStreamReader isr = new InputStreamReader(fis, "Cp1257"); BufferedReader reader = new BufferedReader(isr); String line = null; ArrayList features = null; while ((line = reader.readLine()) != null) { //entry[0] = stem; entry[1] = pos; entry[2] = paradigm. String[] entry = line.split("\t"); if (!stems.containsKey(entry[0])) stems.put(entry[0], new ArrayList()); features = (ArrayList)stems.get(entry[0]); features.add(new NounStemFeature(entry[1], entry[2])); stems.put(entry[0], features); } reader.close(); } /** * Reads table of noun endings into memory. * @param lex lexicon data file. */ private void readNounEndings(File lex) throws IOException { FileInputStream fis = new FileInputStream(lex); InputStreamReader isr = new InputStreamReader(fis, "Cp1257"); BufferedReader reader = new BufferedReader(isr); String line = null; ArrayList features = null; while ((line = reader.readLine()) != null) { //entry[0] = ending; entry[1] = paradigm; entry[2] = number; entry[3] = case. String[] entry = line.split("\t"); if (!endings.containsKey(entry[0])) endings.put(entry[0], new ArrayList()); features = (ArrayList)endings.get(entry[0]); features.add(new NounEndingFeature(entry[1], entry[2], entry[3])); endings.put(entry[0], features); } reader.close(); } /** * Reads table of noun declensions into memory. * @param lex lexicon data file. */ private void readDeclensions(File lex) throws IOException { FileInputStream fis = new FileInputStream(lex); InputStreamReader isr = new InputStreamReader(fis, "Cp1257"); BufferedReader reader = new BufferedReader(isr); String line = null; while ((line = reader.readLine()) != null) { //entry[0] = name; entry[1] = gender. String[] entry = line.split("\t"); paradigms.put(entry[0], entry[1]); } reader.close(); } /** * Constructor. * @param properties configuration file describing location of a database */ public Analyzer(File properties) throws IOException { Properties config = readConfiguration(properties); stems = new TreeMap(Collator.getInstance(new Locale("lv", "LV"))); endings = new TreeMap(Collator.getInstance(new Locale("lv", "LV"))); paradigms = new TreeMap(); //Read database and create search tree. readNounStems(new File(config.getProperty("noun_stems"))); readNounEndings(new File(config.getProperty("noun_endings"))); readDeclensions(new File(config.getProperty("declensions"))); // //Read FSTs - not implemented yet. // } /** * Transforms given word form from surface level into lexical level representation. * @param wf word form to parse. * @return list of possible morphological representations of the given form. */ public ArrayList parse(String wf) { ArrayList output = null; String stem = ""; // //Apply plugged in FSTs - not implemented yet. // //Searching for stem - hard-coded FST approach. for (int i = 0; i < wf.length(); i++) { //Read char by char from the surface type and look-up for corresponding stem in the stem tree. stem = stem + wf.charAt(i); if (stems.containsKey(stem)) { //Remaining part of the word form has to be legal ending for the paradigm(s), //which is/are associated with the stem in the lexicon. //Nevertheless search for longer stem if there is more than one char left on surface type. String ending = ""; if (i < wf.length()) ending = wf.substring(i + 1); if (!endings.containsKey(ending)) continue; //Read morphological features assigned to the stem. ArrayList stem_features = (ArrayList)stems.get(stem); Iterator itSF = stem_features.iterator(); while (itSF.hasNext()) { NounStemFeature nsf = (NounStemFeature)itSF.next(); String pos = nsf.getPOS(); String s_decl = nsf.getParadigm(); //Read morphological features assigned to the ending. ArrayList ending_features = (ArrayList)endings.get(ending); Iterator itEF = ending_features.iterator(); while (itEF.hasNext()) { NounEndingFeature nef = (NounEndingFeature)itEF.next(); String e_decl = nef.getParadigm(); String number = nef.getNumber(); String e_case = nef.getCase(); //Stem and ending feature mapping through joint paradigm. if (s_decl.equals(e_decl)) { //Word form accepted (recognized). //Map gender. String gender = (String)paradigms.get(s_decl); //Prepare output. if (output == null) output = new ArrayList(); output.add(stem + " " + pos + " " + gender + " " + number + " " + e_case); } } } } } return output; } }