View Javadoc

1   /*
2   Copyright (c) 2005 The European Bioinformatics Institute, and others.
3   All rights reserved. Please see the file LICENSE
4   in the root directory of this distribution.
5   */
6   package uk.ac.ebi.intenz.tools.sib.translator.rules;
7   
8   import java.io.BufferedReader;
9   import java.io.IOException;
10  import java.io.InputStream;
11  import java.io.InputStreamReader;
12  import java.util.ArrayList;
13  import java.util.regex.Matcher;
14  import java.util.regex.Pattern;
15  
16  import org.apache.log4j.Logger;
17  
18  
19  /**
20   * GrammarRules
21   *
22   * @author P. de Matos
23   * @version $id 15-Jul-2005 14:41:31
24   *          <p/>
25   *          History:<br>
26   *          <table>
27   *          <tr><th>Developer</th><th>Date</th><th>Description</th></tr>
28   *          <tr><td>P.de Matos</td><td>15-Jul-2005</td><td>Created class</td></tr>
29   *          </table>
30   */
31  public class GrammarRules implements RuleGroup {
32  
33     private static final Logger LOGGER = Logger.getLogger(GrammarRules.class);
34  
35    private static GrammarRules INSTANCE = new GrammarRules();
36  
37     private final String[][] CAPITILISATION_RULES_REVERSED = {
38       {"^TRNA", "tRNA"},
39       {"^MRNA", "mRNA"},
40       {"^RRNA", "rRNA"},
41       {"^CDNA", "cDNA"},
42       {"^DAMP", "dAMP"},
43       {"^DADP", "dADP"},
44       {"^DATP", "dATP"},
45       {"^DUMP", "dUMP"},
46       {"^DUDP", "dUDP"},
47       {"^DUTP", "dUTP"},
48       {"^DCMP", "dCMP"},
49       {"^DCDP", "dCDP"},
50       {"^DCTP", "dCTP"},
51       {"^DGMP", "dGMP"},
52       {"^DGDP", "dGDP"},
53       {"^DGTP", "dGTP"},
54       {"^DTMP", "dTMP"},
55       {"^DTDP", "dTDP"},
56       {"^DTTP", "dTTP"},
57       {"^CAMP", "cAMP"},
58       {"^CGMP", "cGMP"}
59  
60    };
61  
62     // These are done elsewhere so we exclude them
63     private final String[] CAPITILISATION_EXCLUSIONS = {
64        "^p\\-",
65        "^m\\-",
66        "^o\\-",
67        "^n\\-",
68        "^tRNA",
69        "^tRNase",
70        "^mRNA",
71        "^rRNA",
72        "^cDNA",
73        "^dAMP",
74        "^dADP",
75        "^dATP",
76        "^dUMP",
77        "^dUDP",
78        "^dUTP",
79        "^dCMP",
80        "^dCDP",
81        "^dCTP",
82        "^dGMP",
83        "^dGDP",
84        "^dGTP",
85        "^dTMP",
86        "^dTDP",
87        "^dTTP",
88        "^dIDP",
89        "^dITP",
90        "^cAMP",
91        "^cGMP",
92        "^ppGpp",
93        "^pppGpp",
94        "^\\(ppGpp\\)",
95        "^\\(fMet\\)",
96        "^fMet",
97        "^n\\s",
98        "^m\\s",
99        "^tPA",
100       "^(\\[|\\(|\\s)eIF",
101       "^sn-",
102       "^\\d*?\\([\\+\\-\\d]+?\\)\\-",
103       "^cd-"
104    };
105 
106    private final Pattern[] SNEAKY_CAPIT_PATTERNS = {
107        Pattern.compile("^(\\[|\\()(\\p{Lower})(\\p{Lower})")
108    };
109 
110 
111    private final Pattern SNEAKY_CAPIT_PATTERN = Pattern.compile("^(\\[|\\()(\\w)");
112 
113    private final ArrayList DECAPITALIZATION_PATTERNS = new ArrayList();
114 
115    private GrammarRules () {
116         InputStream stream = this.getClass().getClassLoader().getResourceAsStream("decapitalization_patterns.txt");
117         BufferedReader br = new BufferedReader(new InputStreamReader(stream));
118         String pattern;
119         try {
120             pattern = br.readLine();
121             while (pattern != null){
122                 DECAPITALIZATION_PATTERNS.add(pattern);
123                 pattern = br.readLine();
124             }
125         } catch (IOException e) {
126             e.printStackTrace();
127         } finally {
128             try {
129                 br.close();
130                 stream.close();
131             } catch (IOException e){}
132         }
133     }
134 
135    /**
136      * Returns the sole instance of this class.
137      * <p/>
138      * If no instance is available yet then it will be created.
139      *
140      * @return the class's sole instance.
141      */
142     public static GrammarRules getInstance() {
143       return INSTANCE;
144     }
145 
146 
147    public String applyRules (String text) {
148     return reverseRules(text);
149    }
150 
151    public String reverseRules(String text) {
152         if (text == null)
153             throw new NullPointerException("Parameter 'text' must not be null.");
154         if (text.length() == 0)
155             return text;
156 
157         // Capitalise the first letter of a line.
158         boolean excludeFromCapitilisation = false;
159         for (int iii = 0; iii < CAPITILISATION_EXCLUSIONS.length; iii++) {
160             String exclusion = CAPITILISATION_EXCLUSIONS[iii];
161             if (Pattern.compile(exclusion).matcher(text).find()) {
162                 excludeFromCapitilisation = true;
163                 break;
164             }
165         }
166         if (!excludeFromCapitilisation)
167             text = doCapitilisation(text);
168         text = doDecapitalisation(text);
169 
170         return text;
171     }
172 
173    private String doCapitilisation (String text) {
174       for (int i = 0; i < SNEAKY_CAPIT_PATTERNS.length; i++){
175           Matcher matcher = SNEAKY_CAPIT_PATTERNS[i].matcher(text);
176           if (matcher.find()){
177               text = matcher.replaceAll(matcher.group(1) + matcher.group(2).toUpperCase() + matcher.group(3));
178           }
179       }
180       text = text.substring(0,1).toUpperCase() + text.substring(1);
181       text = translate(text, CAPITILISATION_RULES_REVERSED);
182       return text;
183    }
184 
185     private String doDecapitalisation(String text) {
186         for (int i = 0; i < DECAPITALIZATION_PATTERNS.size(); i++){
187             String s = (String) DECAPITALIZATION_PATTERNS.get(i);
188             Pattern pattern = Pattern.compile("(^|\\.\\s)(" + s + ")(\\p{Upper})");
189             Matcher matcher = pattern.matcher(text);
190             while (matcher.find()){
191                 text = matcher.replaceFirst(matcher.group(1) + matcher.group(2) + matcher.group(3).toLowerCase());
192                 matcher = pattern.matcher(text);
193             }
194         }
195         return text;
196     }
197 
198    /**
199    * This method performs the actual application of the rules.
200    *
201    * @param text  The text to be translated.
202    * @param rules Map of rules to be applied.
203    * @return the translated text.
204    */
205   private String translate(String text, String[][] rules) {
206     for (int iii = 0; iii < rules.length; iii++) {
207       String[] rule = rules[iii];
208       text = text.replaceAll(rule[0], rule[1]);
209     }
210     return text;
211   }
212 }