View Javadoc

1   package uk.ac.ebi.intenz.tools.sib.translator.rules;
2   
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
7   
8   /**
9    * This singleton class stores regular expression rules to be used to transform text.
10   * <p/>
11   * Each rule consists of a regular expression pattern and a replacement string. The rules will be applied in an
12   * arbitrary order.
13   * <p/>
14   * These rules should only be applied on enzyme reaction data.
15   *
16   * @author Michael Darsow
17   * @version $Revision: 1.3 $ $Date: 2009/03/27 15:33:26 $
18   */
19  public class OrderedRules implements RuleGroup {
20  
21    private static final Logger LOGGER = Logger.getLogger(OrderedRules.class);
22  
23    private static OrderedRules INSTANCE = new OrderedRules();
24  
25    private final String[][] RULES =
26            {
27              {"([Dd])elta\\((\\d+)\\((\\d+)\\((\\d+)\\)\\)\\)",
28               "<greek>$1elta</greek><smallsup>$2($3<smallsup>$4</smallsup>)</smallsup>"},
29              {"([Dd])elta\\((\\d+)\\((\\d+)\\)\\)", "<greek>$1elta</greek><smallsup>$2($3)</smallsup>"},
30              {"([Dd])elta\\((\\d+)\\)", "<greek>$1elta</greek><smallsup>$2</smallsup>"},
31              {"([^\\>])([dD])elta([^\\<])", "$1<greek>$2elta</greek>$3"},
32              {"([^\\>])([aA])lpha([^\\<])", "$1<greek>$2lpha</greek>$3"},
33              {"([^\\>])([gG])amma([^\\<])", "$1<greek>$2amma</greek>$3"},
34              {"([^\\>])([bB])eta([^\\<])", "$1<greek>$2eta</greek>$3"},
35              {"([^\\>])([eE])psilon([^\\<])", "$1<greek>$2psilon</greek>$3"},
36              {"\\<greek\\>beta\\<\\/greek\\>ine", "betaine"},
37              {"mu\\-", "<greek>mu</greek>-"},
38              {"N\\,N([\\'\\-])", "<element>N</element>,<element>N</element>$1"},
39              {"(\\W)N\\-([^\\dt])", "$1<element>N</element>-$2"},
40              {"^N\\-([^\\dt])", "<element>N</element>-$1"},
41              {"O\\(2\\)\\(\\-\\)", "O<smallsub>2</smallsub><smallsup>-</smallsup>"},
42              {"O\\(2\\)", "O<smallsub>2</smallsub>"},
43              {"\\<em\\_dash\\/\\>([DL])\\-", "<em_dash/><small>$1</small>-"},
44              {"\\<em\\_dash\\/\\>([aA])lpha", "<em_dash/><greek>$1lpha</greek>"},
45              {"\\<em\\_dash\\/\\>([bB])eta", "<em_dash/><greek>$1eta</greek>"},
46              {"\\<em\\_dash\\/\\>([gG])amma", "<em_dash/><greek>$1amma</greek>"},
47              {"\\<em\\_dash\\/\\>([dD])elta", "<em_dash/><greek>$1elta</greek>"},
48              {"\\<em\\_dash\\/\\>([eE])psilon", "<em_dash/><greek>$1psilon</greek>"},
49              {"([SOCN])(\\(\\d+)", "<element>$1</element>$2"},
50              {"\\-([SOCN])\\-", "-<element>$1</element>-"},
51              {"(\\W+?)([SOCN])\\-", "$1<element>$2</element>-"},
52              {"^([SOCN])\\-", "<element>$1</element>-"},
53              {"\\-([SOCN])(\\W)+?", "-<element>$1</element>$2"},
54              {"(\\-?)(\\d+)([DL])(\\(?)\\-", "$1$2<stereo>$3</stereo>$4-"},
55              {"\\-([DL])([\\-\\(])", "-<stereo>$1</stereo>$2"},
56              {"(\\W)([DL])([\\-\\(])", "$1<stereo>$2</stereo>$3"},
57              {"^([DL])([\\-\\(])", "<stereo>$1</stereo>$2"},
58              {"([\\-\\,])all-cis([\\-\\,])", "$1<stereo>all-cis</stereo>$2"},
59              {"[aA]ll-cis([\\-\\,])", "<stereo>all-cis</stereo>$1"},
60              {"([\\-\\,])all-trans([\\-\\,])", "$1<stereo>all-trans</stereo>$2"},
61              {"[aA]ll\\-trans([\\-\\,])", "<stereo>all-trans</stereo>$1"},
62              {"([\\-\\,])cis([\\-\\,])", "$1<stereo>cis</stereo>$2"},
63              {"[cC]is([\\-\\,])", "<stereo>cis</stereo>$1"},
64              {"([\\-\\,])trans([\\-\\,])", "$1<stereo>trans</stereo>$2"},
65              {"[tT]rans([\\-\\,])", "<stereo>trans</stereo>$1"},
66              {"\\-allo\\-", "-<stereo>allo</stereo>-"},
67  
68              {"(\\W)([aA])llo\\-", "$1<stereo>$2llo</stereo>-"},
69              {"^([aA])llo\\-", "<stereo>$1llo</stereo>-"},
70              {"\\-altro\\-", "-<stereo>altro</stereo>-"},
71              {"(\\W)([aA])ltro\\-", "$1<stereo>$2ltro</stereo>-"},
72              {"^([aA])ltro\\-", "<stereo>$1ltro</stereo>-"},
73              {"\\-arabino\\-", "-<stereo>arabino</stereo>-"},
74              {"(\\W)([aA])rabino\\-", "$1<stereo>$2rabino</stereo>-"},
75              {"^([aA])rabino\\-", "<stereo>$1rabino</stereo>-"},
76              {"\\-erythro\\-", "-<stereo>erythro</stereo>-"},
77              {"(\\W)([eE])rythro\\-", "$1<stereo>$2rythro</stereo>-"},
78              {"^([eE])rythro\\-", "<stereo>$1rythro</stereo>-"},
79              {"\\-galacto\\-", "-<stereo>galacto</stereo>-"},
80              {"(\\W)([gG])alacto\\-", "$1<stereo>$2alacto</stereo>-"},
81              {"^([gG])alacto\\-", "<stereo>$1alacto</stereo>-"},
82              {"\\-gluco\\-", "-<stereo>gluco</stereo>-"},
83              {"(\\W)([gG])luco\\-", "$1<stereo>$2luco</stereo>-"},
84              {"^([gG])luco\\-", "<stereo>$1luco</stereo>-"},
85              {"\\-glycero\\-", "-<stereo>glycero</stereo>-"},
86              {"(\\W)([gG])lycero\\-", "$1<stereo>$2lycero</stereo>-"},
87              {"^([gG])lycero\\-", "<stereo>$1lycero</stereo>-"},
88              {"\\-gulo\\-", "-<stereo>gulo</stereo>-"},
89              {"(\\W)([gG])ulo\\-", "$1<stereo>$2ulo</stereo>-"},
90              {"^([gG])ulo\\-", "<stereo>$1ulo</stereo>-"},
91              {"\\-ido\\-", "-<stereo>ido</stereo>-"},
92              {"(\\W)([iI])do\\-", "$1<stereo>$2do</stereo>-"},
93              {"^([iI])do\\-", "<stereo>$1do</stereo>-"},
94              {"\\-lyxo\\-", "-<stereo>lyxo</stereo>-"},
95              {"(\\W)([lL])yxo\\-", "$1<stereo>$2yxo</stereo>-"},
96              {"^([lL])yxo\\-", "<stereo>$1yxo</stereo>-"},
97              {"\\-manno\\-", "-<stereo>manno</stereo>-"},
98              {"(\\W)([mM])anno\\-", "$1<stereo>$2anno</stereo>-"},
99              {"^([mM])anno\\-", "<stereo>$1anno</stereo>-"},
100             {"\\-ribo\\-", "-<stereo>ribo</stereo>-"},
101             {"(\\W)([rR])ibo\\-", "$1<stereo>$2ibo</stereo>-"},
102             {"^([rR])ibo\\-", "<stereo>$1ibo</stereo>-"},
103             {"\\-talo\\-", "-<stereo>talo</stereo>-"},
104             {"(\\W)([tT])alo\\-", "$1<stereo>$2alo</stereo>-"},
105             {"^([tT])alo\\-", "<stereo>$1alo</stereo>-"},
106             {"\\-threo\\-", "-<stereo>threo</stereo>-"},
107             {"(\\W)([tT])hreo\\-", "$1<stereo>$2hreo</stereo>-"},
108             {"^([tT])hreo\\-", "<stereo>$1hreo</stereo>-"},
109             {"\\-xylo\\-", "-<stereo>xylo</stereo>-"},
110             {"(\\W)([xX])ylo\\-", "$1<stereo>$2ylo</stereo>-"},
111             {"^([xX])ylo\\-", "<stereo>$1ylo</stereo>-"},
112             {"((\\d)\\((\\d))H(\\)-)","$1<element>H</element>$4"}
113           };
114 
115   private final String[][] RULES_REVERSED =
116           {
117             {"\\<greek\\>([Dd])elta\\<\\/greek\\>\\<smallsup\\>(\\d+)\\((\\d+)\\<smallsup\\>(\\d+)\\<\\/smallsup\\>\\)\\<\\/smallsup\\>",
118              "$1elta($2($3($4)))"},
119             {"\\<greek\\>([Dd])elta\\<\\/greek\\>\\<smallsup\\>(\\d+)\\((\\d+)\\)" +
120              "\\<\\/smallsup\\>",
121              "$1elta($2($3))"},
122             {"\\<greek\\>([Dd])elta\\<\\/greek\\>\\<smallsup\\>(\\d+)\\<\\/smallsup\\>",
123              "$1elta($2)"},
124             {"(\\d+)\\<greek\\>", "$1-<greek>"},
125             {"\\<greek\\>([dD])elta\\<\\/greek\\>", "$1elta"},
126             {"\\<greek\\>([aA])lpha\\<\\/greek\\>", "$1lpha"},
127             {"\\<greek\\>([gG])amma\\<\\/greek\\>", "$1amma"},
128             {"\\<greek\\>([bB])eta\\<\\/greek\\>", "$1eta"},
129             {"\\<greek\\>([eE])psilon\\<\\/greek\\>", "$1psilon"},
130             {"\\<greek\\>mu\\<\\/greek\\>\\-", "mu-"},
131             {"\\<element\\>N\\<\\/element\\>\\,\\<element\\>N\\<\\/element\\>([\\'\\-])", "N,N$1"},
132             {"(\\W)\\<element\\>N\\<\\/element\\>\\-([^\\dt])", "$1N-$2"},
133             {"^\\<element\\>N\\<\\/element\\>\\-([^\\dt])", "N-$1"},
134             {"O\\<smallsub\\>2\\<\\/smallsub\\>\\<smallsup\\>\\-\\<\\/smallsup\\>",
135              "O(2)(-)"},
136             {"O\\<smallsub\\>2\\<\\/smallsub\\>", "O(2)"},
137             {"\\-\\<element\\>S\\<\\/element\\>\\-", "-S-"},
138             {"\\<element\\>S\\<\\/element\\>\\-", "S-"},
139             {"\\-\\<element\\>S\\<\\/element\\>", "-S"},
140             {"((\\d)\\((\\d))\\<element\\>H\\<\\/element\\>(\\)-)", "=$1H$4"},
141             {"\\<smallsub\\>(.+?)\\</smallsub\\>","($1)"},
142             {"\\<smallsup\\>(.+?)\\</smallsup\\>","($1)"},
143 // TODO: TAKE THE EXCEPTIONS OUT OF HERE TO A TEXT FILE!:
144             {"\\[(?!Fe\\([23]\\+\\)|Co\\(II\\)|NiFe|[234]Fe-[24]S|Glu\\]|heparan sulfate|lipopolysaccharide glucose|blood group substance|[Mm]yelin[ -]proteolipid|3\\.2\\.2|4-vinyl|1,4\\]|cd|ambiguous|misleading|misprint|obsolete|incorrect|\\d,\\d-[af]\\]|\\d,\\d-<ital>[af]</ital>\\]|<ital>a</ital>\\]|tRNA\\]|ligated tRNA|-\\d|side [12])([^\\[\\]]*?)\\]","($1)"},        // square brackets [ ]
145             {"\\{([^\\}]*?\\([^\\}]*?\\)[^\\}]*?)\\}","($1)"},        // curly brackets { }
146             {"\\{","("},
147             {"\\}", ")"}
148           };
149 
150   private final String[] STEREO_TERMS = {"D", "L", "all-cis", "all-trans", "cis", "trans", "allo", "altro", "arabino", "erythro", "galacto",
151                             "gluco", "glycero", "gulo", "ido", "lyxo", "manno", "ribo", "talo", "threo", "xylo"};
152 
153 
154   /**
155    * Initialises the class's sole instance.
156    */
157   private OrderedRules() {
158   }
159 
160   /**
161    * Returns the sole instance of this class.
162    * <p/>
163    * If no instance is available yet then it will be created.
164    *
165    * @return the class's sole instance.
166    */
167   public static OrderedRules getInstance() {
168     return INSTANCE;
169   }
170 
171   /**
172    * Applies the rules.
173    *
174    * @param text The text to be translated.
175    * @return the translated text.
176    * @throws NullPointerException if <code>text</code> is <code>null</code>.
177    */
178   public String applyRules(String text) {
179     if (text == null) throw new NullPointerException("Parameter 'text' must not be null.");
180     text = applyStereoRSZERule(text);
181     text = translate(text, RULES);
182     text = applyDecapitalisationRule(text);
183     return text;
184   }
185 
186   /**
187    * Applies the reverse rules.
188    *
189    * @param text The text to be translated.
190    * @return the translated text.
191    * @throws NullPointerException if <code>text</code> is <code>null</code>.
192    */
193   public String reverseRules(String text) {
194     if (text == null) throw new NullPointerException("Parameter 'text' must not be null.");
195     text = applyStereoRSZERuleReversed(text);
196     text = translate(text, RULES_REVERSED);
197     text = applyReverseStereoRules(text);
198     return text;
199   }
200 
201     /**
202    * This method performs the actual application of the rules.
203    *
204    * @param text  The text to be translated.
205    * @param rules Map of rules to be applied.
206    * @return the translated text.
207    */
208   private String translate(String text, String[][] rules) {
209     for (int iii = 0; iii < rules.length; iii++) {
210       String[] rule = rules[iii];
211       text = text.replaceAll(rule[0], rule[1]);
212     }
213     return text;
214   }
215 
216   /**
217    * Transforms R, S, Z, E letters into <code>&lt;stereo&gt;<i>R, S, Z or E</i>&lt;/stereo&gt;</code> if these letters
218    * occur within parenthesis and are only preceded by numbers and/or 'A's.
219    *
220    * @param text Text to be translated.
221    * @return the translated text.
222    */
223   private String applyStereoRSZERule(String text) {
224     Pattern coarseRSZEPattern = Pattern.compile("(\\((?:\\d*?A*?[RSZE]\\,?)+?\\))");
225     Matcher coarseRSZEMatcher = coarseRSZEPattern.matcher(text);
226     StringBuffer textStringBuffer = new StringBuffer();
227     boolean found = false;
228     int oldStart = 0;
229     int start = 0;
230     while (coarseRSZEMatcher.find()) {
231       found = true;
232       start = coarseRSZEMatcher.start(1);
233       String group = coarseRSZEMatcher.group(1);
234       Pattern RSZEPattern = Pattern.compile("([RSZE])");
235       Matcher RSZEMatcher = RSZEPattern.matcher(group);
236       group = RSZEMatcher.replaceAll("<stereo>$1</stereo>");
237       textStringBuffer.append(text.substring(oldStart, start) + group);
238       oldStart = coarseRSZEMatcher.end(1);
239     }
240     textStringBuffer.append(text.substring(oldStart));
241     if (found) text = textStringBuffer.toString();
242     return text;
243   }
244 
245   /**
246    * Reverses the rule applied in {@link OrderedRules#applyStereoRSZERule(String)}.
247    *
248    * @param text Text to be translated.
249    * @return the translated text.
250    */
251   private String applyStereoRSZERuleReversed(String text) {
252     return text.replaceAll("\"<stereo>([RSZE])</stereo>\"", "$1");
253   }
254 
255   /**
256    * Decapitalises words which follow a square bracket.
257    *
258    * @param text The text to be translated.
259    * @return the translated text.
260    */
261   private String applyDecapitalisationRule(String text) {
262     Pattern pattern = Pattern.compile("^\\[(\\p{Upper})(\\p{Lower})");
263     Matcher matcher = pattern.matcher(text);
264     if (matcher.find()) {
265       text = matcher.replaceAll("[" + matcher.group(1).toLowerCase() + matcher.group(2));
266     }
267     return text;
268   }
269 
270   /**
271    * Applies reverse <code>STEREO</code> rules.
272    *
273    * @param text The text to be translated.
274    * @return the translated text.
275    */
276   private String applyReverseStereoRules(String text) {
277     for (int iii = 0; iii < STEREO_TERMS.length; iii++) {
278       String stereoTerm = STEREO_TERMS[iii];
279       text = text.replaceAll("([\\-\\,])<stereo>" + stereoTerm + "</stereo>([\\-\\,\\(])", "$1" + stereoTerm + "$2");
280       text = text.replaceAll("<stereo>" + stereoTerm + "</stereo>([\\-\\,\\(])", stereoTerm + "$1");
281     }
282     return text;
283   }
284 }