View Javadoc
1   /*
2   Copyright (c) 2005 The European Bioinformatics Institute, and others.
3   All rights reserved. Please see the file LICENSE
4   in the root directory of this distribution.
5   */
6   package uk.ac.ebi.intenz.tools.sib.translator.rules;
7   
8   import java.io.BufferedReader;
9   import java.io.IOException;
10  import java.io.InputStream;
11  import java.io.InputStreamReader;
12  import java.util.ArrayList;
13  import java.util.regex.Matcher;
14  import java.util.regex.Pattern;
15  
16  import org.apache.log4j.Logger;
17  
18  
19  /**
20   * GrammarRules
21   *
22   * @author P. de Matos
23   * @version $id 15-Jul-2005 14:41:31
24   *          <p/>
25   *          History:<br>
26   *          <table>
27   *          <tr><th>Developer</th><th>Date</th><th>Description</th></tr>
28   *          <tr><td>P.de Matos</td><td>15-Jul-2005</td><td>Created class</td></tr>
29   *          </table>
30   */
31  public class GrammarRules implements RuleGroup {
32  
33     private static final Logger LOGGER = Logger.getLogger(GrammarRules.class);
34  
35    private static GrammarRules INSTANCE = new GrammarRules();
36  
37     private final String[][] CAPITILISATION_RULES_REVERSED = {
38       {"^TRNA", "tRNA"},
39       {"^MRNA", "mRNA"},
40       {"^RRNA", "rRNA"},
41       {"^CDNA", "cDNA"},
42       {"^DAMP", "dAMP"},
43       {"^DADP", "dADP"},
44       {"^DATP", "dATP"},
45       {"^DUMP", "dUMP"},
46       {"^DUDP", "dUDP"},
47       {"^DUTP", "dUTP"},
48       {"^DCMP", "dCMP"},
49       {"^DCDP", "dCDP"},
50       {"^DCTP", "dCTP"},
51       {"^DGMP", "dGMP"},
52       {"^DGDP", "dGDP"},
53       {"^DGTP", "dGTP"},
54       {"^DTMP", "dTMP"},
55       {"^DTDP", "dTDP"},
56       {"^DTTP", "dTTP"},
57       {"^CAMP", "cAMP"},
58       {"^CGMP", "cGMP"}
59  
60    };
61  
62     // These are done elsewhere so we exclude them
63     private final String[] CAPITILISATION_EXCLUSIONS = {
64        "^p\\-",
65        "^m\\-",
66        "^o\\-",
67        "^n\\-",
68        "^tRNA",
69        "^tRNase",
70        "^mRNA",
71        "^rRNA",
72        "^cDNA",
73        "^dAMP",
74        "^dADP",
75        "^dATP",
76        "^dUMP",
77        "^dUDP",
78        "^dUTP",
79        "^dCMP",
80        "^dCDP",
81        "^dCTP",
82        "^dGMP",
83        "^dGDP",
84        "^dGTP",
85        "^dTMP",
86        "^dTDP",
87        "^dTTP",
88        "^cAMP",
89        "^cGMP",
90        "^ppGpp",
91        "^pppGpp",
92        "^\\(ppGpp\\)",
93        "^\\(fMet\\)",
94        "^fMet",
95        "^n\\s",
96        "^m\\s",
97        "^tPA",
98        "^(\\[|\\(|\\s)eIF",
99        "^sn-",
100       "^\\d*?\\([\\+\\-\\d]+?\\)\\-",
101       "^cd-"
102    };
103 
104    private final Pattern[] SNEAKY_CAPIT_PATTERNS = {
105        Pattern.compile("^(\\[|\\()(\\p{Lower})(\\p{Lower})")
106    };
107 
108 
109    private final Pattern SNEAKY_CAPIT_PATTERN = Pattern.compile("^(\\[|\\()(\\w)");
110 
111    private final ArrayList DECAPITALIZATION_PATTERNS = new ArrayList();
112 
113    private GrammarRules () {
114         InputStream stream = this.getClass().getClassLoader().getResourceAsStream("decapitalization_patterns.txt");
115         BufferedReader br = new BufferedReader(new InputStreamReader(stream));
116         String pattern;
117         try {
118             pattern = br.readLine();
119             while (pattern != null){
120                 DECAPITALIZATION_PATTERNS.add(pattern);
121                 pattern = br.readLine();
122             }
123         } catch (IOException e) {
124             e.printStackTrace();
125         } finally {
126             try {
127                 br.close();
128                 stream.close();
129             } catch (IOException e){}
130         }
131     }
132 
133    /**
134      * Returns the sole instance of this class.
135      * <p/>
136      * If no instance is available yet then it will be created.
137      *
138      * @return the class's sole instance.
139      */
140     public static GrammarRules getInstance() {
141       return INSTANCE;
142     }
143 
144 
145    public String applyRules (String text) {
146     return reverseRules(text);
147    }
148 
149    public String reverseRules(String text) {
150         if (text == null)
151             throw new NullPointerException("Parameter 'text' must not be null.");
152         if (text.length() == 0)
153             return text;
154 
155         // Capitalise the first letter of a line.
156         boolean excludeFromCapitilisation = false;
157         for (int iii = 0; iii < CAPITILISATION_EXCLUSIONS.length; iii++) {
158             String exclusion = CAPITILISATION_EXCLUSIONS[iii];
159             if (Pattern.compile(exclusion).matcher(text).find()) {
160                 excludeFromCapitilisation = true;
161                 break;
162             }
163         }
164         if (!excludeFromCapitilisation)
165             text = doCapitilisation(text);
166         text = doDecapitalisation(text);
167 
168         return text;
169     }
170 
171    private String doCapitilisation (String text) {
172       for (int i = 0; i < SNEAKY_CAPIT_PATTERNS.length; i++){
173           Matcher matcher = SNEAKY_CAPIT_PATTERNS[i].matcher(text);
174           if (matcher.find()){
175               text = matcher.replaceAll(matcher.group(1) + matcher.group(2).toUpperCase() + matcher.group(3));
176           }
177       }
178       text = text.substring(0,1).toUpperCase() + text.substring(1);
179       text = translate(text, CAPITILISATION_RULES_REVERSED);
180       return text;
181    }
182 
183     private String doDecapitalisation(String text) {
184         for (int i = 0; i < DECAPITALIZATION_PATTERNS.size(); i++){
185             String s = (String) DECAPITALIZATION_PATTERNS.get(i);
186             Pattern pattern = Pattern.compile("(^|\\.\\s)(" + s + ")(\\p{Upper})");
187             Matcher matcher = pattern.matcher(text);
188             while (matcher.find()){
189                 text = matcher.replaceFirst(matcher.group(1) + matcher.group(2) + matcher.group(3).toLowerCase());
190                 matcher = pattern.matcher(text);
191             }
192         }
193         return text;
194     }
195 
196    /**
197    * This method performs the actual application of the rules.
198    *
199    * @param text  The text to be translated.
200    * @param rules Map of rules to be applied.
201    * @return the translated text.
202    */
203   private String translate(String text, String[][] rules) {
204     for (int iii = 0; iii < rules.length; iii++) {
205       String[] rule = rules[iii];
206       text = text.replaceAll(rule[0], rule[1]);
207     }
208     return text;
209   }
210 }