1 package uk.ac.ebi.intenz.tools.sib.translator.rules;
2
3 import java.util.regex.Matcher;
4 import java.util.regex.Pattern;
5
6 import org.apache.log4j.Logger;
7
8
9
10
11
12
13
14
15
16
17
18
19 public class OrderedRules implements RuleGroup {
20
21 private static final Logger LOGGER = Logger.getLogger(OrderedRules.class);
22
23 private static OrderedRules INSTANCE = new OrderedRules();
24
25 private final String[][] RULES =
26 {
27 {"([Dd])elta\\((\\d+)\\((\\d+)\\((\\d+)\\)\\)\\)",
28 "<greek>$1elta</greek><smallsup>$2($3<smallsup>$4</smallsup>)</smallsup>"},
29 {"([Dd])elta\\((\\d+)\\((\\d+)\\)\\)", "<greek>$1elta</greek><smallsup>$2($3)</smallsup>"},
30 {"([Dd])elta\\((\\d+)\\)", "<greek>$1elta</greek><smallsup>$2</smallsup>"},
31 {"([^\\>])([dD])elta([^\\<])", "$1<greek>$2elta</greek>$3"},
32 {"([^\\>])([aA])lpha([^\\<])", "$1<greek>$2lpha</greek>$3"},
33 {"([^\\>])([gG])amma([^\\<])", "$1<greek>$2amma</greek>$3"},
34 {"([^\\>])([bB])eta([^\\<])", "$1<greek>$2eta</greek>$3"},
35 {"([^\\>])([eE])psilon([^\\<])", "$1<greek>$2psilon</greek>$3"},
36 {"\\<greek\\>beta\\<\\/greek\\>ine", "betaine"},
37 {"mu\\-", "<greek>mu</greek>-"},
38 {"N\\,N([\\'\\-])", "<element>N</element>,<element>N</element>$1"},
39 {"(\\W)N\\-([^\\dt])", "$1<element>N</element>-$2"},
40 {"^N\\-([^\\dt])", "<element>N</element>-$1"},
41 {"O\\(2\\)\\(\\-\\)", "O<smallsub>2</smallsub><smallsup>-</smallsup>"},
42 {"O\\(2\\)", "O<smallsub>2</smallsub>"},
43 {"\\<em\\_dash\\/\\>([DL])\\-", "<em_dash/><small>$1</small>-"},
44 {"\\<em\\_dash\\/\\>([aA])lpha", "<em_dash/><greek>$1lpha</greek>"},
45 {"\\<em\\_dash\\/\\>([bB])eta", "<em_dash/><greek>$1eta</greek>"},
46 {"\\<em\\_dash\\/\\>([gG])amma", "<em_dash/><greek>$1amma</greek>"},
47 {"\\<em\\_dash\\/\\>([dD])elta", "<em_dash/><greek>$1elta</greek>"},
48 {"\\<em\\_dash\\/\\>([eE])psilon", "<em_dash/><greek>$1psilon</greek>"},
49 {"([SOCN])(\\(\\d+)", "<element>$1</element>$2"},
50 {"\\-([SOCN])\\-", "-<element>$1</element>-"},
51 {"(\\W+?)([SOCN])\\-", "$1<element>$2</element>-"},
52 {"^([SOCN])\\-", "<element>$1</element>-"},
53 {"\\-([SOCN])(\\W)+?", "-<element>$1</element>$2"},
54 {"(\\-?)(\\d+)([DL])(\\(?)\\-", "$1$2<stereo>$3</stereo>$4-"},
55 {"\\-([DL])([\\-\\(])", "-<stereo>$1</stereo>$2"},
56 {"(\\W)([DL])([\\-\\(])", "$1<stereo>$2</stereo>$3"},
57 {"^([DL])([\\-\\(])", "<stereo>$1</stereo>$2"},
58 {"([\\-\\,])all-cis([\\-\\,])", "$1<stereo>all-cis</stereo>$2"},
59 {"[aA]ll-cis([\\-\\,])", "<stereo>all-cis</stereo>$1"},
60 {"([\\-\\,])all-trans([\\-\\,])", "$1<stereo>all-trans</stereo>$2"},
61 {"[aA]ll\\-trans([\\-\\,])", "<stereo>all-trans</stereo>$1"},
62 {"([\\-\\,])cis([\\-\\,])", "$1<stereo>cis</stereo>$2"},
63 {"[cC]is([\\-\\,])", "<stereo>cis</stereo>$1"},
64 {"([\\-\\,])trans([\\-\\,])", "$1<stereo>trans</stereo>$2"},
65 {"[tT]rans([\\-\\,])", "<stereo>trans</stereo>$1"},
66 {"\\-allo\\-", "-<stereo>allo</stereo>-"},
67
68 {"(\\W)([aA])llo\\-", "$1<stereo>$2llo</stereo>-"},
69 {"^([aA])llo\\-", "<stereo>$1llo</stereo>-"},
70 {"\\-altro\\-", "-<stereo>altro</stereo>-"},
71 {"(\\W)([aA])ltro\\-", "$1<stereo>$2ltro</stereo>-"},
72 {"^([aA])ltro\\-", "<stereo>$1ltro</stereo>-"},
73 {"\\-arabino\\-", "-<stereo>arabino</stereo>-"},
74 {"(\\W)([aA])rabino\\-", "$1<stereo>$2rabino</stereo>-"},
75 {"^([aA])rabino\\-", "<stereo>$1rabino</stereo>-"},
76 {"\\-erythro\\-", "-<stereo>erythro</stereo>-"},
77 {"(\\W)([eE])rythro\\-", "$1<stereo>$2rythro</stereo>-"},
78 {"^([eE])rythro\\-", "<stereo>$1rythro</stereo>-"},
79 {"\\-galacto\\-", "-<stereo>galacto</stereo>-"},
80 {"(\\W)([gG])alacto\\-", "$1<stereo>$2alacto</stereo>-"},
81 {"^([gG])alacto\\-", "<stereo>$1alacto</stereo>-"},
82 {"\\-gluco\\-", "-<stereo>gluco</stereo>-"},
83 {"(\\W)([gG])luco\\-", "$1<stereo>$2luco</stereo>-"},
84 {"^([gG])luco\\-", "<stereo>$1luco</stereo>-"},
85 {"\\-glycero\\-", "-<stereo>glycero</stereo>-"},
86 {"(\\W)([gG])lycero\\-", "$1<stereo>$2lycero</stereo>-"},
87 {"^([gG])lycero\\-", "<stereo>$1lycero</stereo>-"},
88 {"\\-gulo\\-", "-<stereo>gulo</stereo>-"},
89 {"(\\W)([gG])ulo\\-", "$1<stereo>$2ulo</stereo>-"},
90 {"^([gG])ulo\\-", "<stereo>$1ulo</stereo>-"},
91 {"\\-ido\\-", "-<stereo>ido</stereo>-"},
92 {"(\\W)([iI])do\\-", "$1<stereo>$2do</stereo>-"},
93 {"^([iI])do\\-", "<stereo>$1do</stereo>-"},
94 {"\\-lyxo\\-", "-<stereo>lyxo</stereo>-"},
95 {"(\\W)([lL])yxo\\-", "$1<stereo>$2yxo</stereo>-"},
96 {"^([lL])yxo\\-", "<stereo>$1yxo</stereo>-"},
97 {"\\-manno\\-", "-<stereo>manno</stereo>-"},
98 {"(\\W)([mM])anno\\-", "$1<stereo>$2anno</stereo>-"},
99 {"^([mM])anno\\-", "<stereo>$1anno</stereo>-"},
100 {"\\-ribo\\-", "-<stereo>ribo</stereo>-"},
101 {"(\\W)([rR])ibo\\-", "$1<stereo>$2ibo</stereo>-"},
102 {"^([rR])ibo\\-", "<stereo>$1ibo</stereo>-"},
103 {"\\-talo\\-", "-<stereo>talo</stereo>-"},
104 {"(\\W)([tT])alo\\-", "$1<stereo>$2alo</stereo>-"},
105 {"^([tT])alo\\-", "<stereo>$1alo</stereo>-"},
106 {"\\-threo\\-", "-<stereo>threo</stereo>-"},
107 {"(\\W)([tT])hreo\\-", "$1<stereo>$2hreo</stereo>-"},
108 {"^([tT])hreo\\-", "<stereo>$1hreo</stereo>-"},
109 {"\\-xylo\\-", "-<stereo>xylo</stereo>-"},
110 {"(\\W)([xX])ylo\\-", "$1<stereo>$2ylo</stereo>-"},
111 {"^([xX])ylo\\-", "<stereo>$1ylo</stereo>-"},
112 {"((\\d)\\((\\d))H(\\)-)","$1<element>H</element>$4"}
113 };
114
115 private final String[][] RULES_REVERSED =
116 {
117 {"\\<greek\\>([Dd])elta\\<\\/greek\\>\\<smallsup\\>(\\d+)\\((\\d+)\\<smallsup\\>(\\d+)\\<\\/smallsup\\>\\)\\<\\/smallsup\\>",
118 "$1elta($2($3($4)))"},
119 {"\\<greek\\>([Dd])elta\\<\\/greek\\>\\<smallsup\\>(\\d+)\\((\\d+)\\)" +
120 "\\<\\/smallsup\\>",
121 "$1elta($2($3))"},
122 {"\\<greek\\>([Dd])elta\\<\\/greek\\>\\<smallsup\\>(\\d+)\\<\\/smallsup\\>",
123 "$1elta($2)"},
124 {"(\\d+)\\<greek\\>", "$1-<greek>"},
125 {"\\<greek\\>([dD])elta\\<\\/greek\\>", "$1elta"},
126 {"\\<greek\\>([aA])lpha\\<\\/greek\\>", "$1lpha"},
127 {"\\<greek\\>([gG])amma\\<\\/greek\\>", "$1amma"},
128 {"\\<greek\\>([bB])eta\\<\\/greek\\>", "$1eta"},
129 {"\\<greek\\>([eE])psilon\\<\\/greek\\>", "$1psilon"},
130 {"\\<greek\\>mu\\<\\/greek\\>\\-", "mu-"},
131 {"\\<element\\>N\\<\\/element\\>\\,\\<element\\>N\\<\\/element\\>([\\'\\-])", "N,N$1"},
132 {"(\\W)\\<element\\>N\\<\\/element\\>\\-([^\\dt])", "$1N-$2"},
133 {"^\\<element\\>N\\<\\/element\\>\\-([^\\dt])", "N-$1"},
134 {"O\\<smallsub\\>2\\<\\/smallsub\\>\\<smallsup\\>\\-\\<\\/smallsup\\>",
135 "O(2)(-)"},
136 {"O\\<smallsub\\>2\\<\\/smallsub\\>", "O(2)"},
137 {"\\-\\<element\\>S\\<\\/element\\>\\-", "-S-"},
138 {"\\<element\\>S\\<\\/element\\>\\-", "S-"},
139 {"\\-\\<element\\>S\\<\\/element\\>", "-S"},
140 {"((\\d)\\((\\d))\\<element\\>H\\<\\/element\\>(\\)-)", "=$1H$4"},
141 {"\\<smallsub\\>(.+?)\\</smallsub\\>","($1)"},
142 {"\\<smallsup\\>(.+?)\\</smallsup\\>","($1)"},
143
144 {"\\[(?!Fe\\([23]\\+\\)|Co\\(II\\)|NiFe|[234]Fe-[24]S|Glu\\]|heparan sulfate|lipopolysaccharide glucose|blood group substance|[Mm]yelin[ -]proteolipid|3\\.2\\.2|4-vinyl|1,4\\]|cd|ambiguous|misleading|misprint|obsolete|incorrect|\\d,\\d-[af]\\]|\\d,\\d-<ital>[af]</ital>\\]|<ital>a</ital>\\]|tRNA\\]|ligated tRNA|-\\d|side [12])([^\\[\\]]*?)\\]","($1)"},
145 {"\\{([^\\}]*?\\([^\\}]*?\\)[^\\}]*?)\\}","($1)"},
146 {"\\{","("},
147 {"\\}", ")"}
148 };
149
150 private final String[] STEREO_TERMS = {"D", "L", "all-cis", "all-trans", "cis", "trans", "allo", "altro", "arabino", "erythro", "galacto",
151 "gluco", "glycero", "gulo", "ido", "lyxo", "manno", "ribo", "talo", "threo", "xylo"};
152
153
154
155
156
157 private OrderedRules() {
158 }
159
160
161
162
163
164
165
166
167 public static OrderedRules getInstance() {
168 return INSTANCE;
169 }
170
171
172
173
174
175
176
177
178 public String applyRules(String text) {
179 if (text == null) throw new NullPointerException("Parameter 'text' must not be null.");
180 text = applyStereoRSZERule(text);
181 text = translate(text, RULES);
182 text = applyDecapitalisationRule(text);
183 return text;
184 }
185
186
187
188
189
190
191
192
193 public String reverseRules(String text) {
194 if (text == null) throw new NullPointerException("Parameter 'text' must not be null.");
195 text = applyStereoRSZERuleReversed(text);
196 text = translate(text, RULES_REVERSED);
197 text = applyReverseStereoRules(text);
198 return text;
199 }
200
201
202
203
204
205
206
207
208 private String translate(String text, String[][] rules) {
209 for (int iii = 0; iii < rules.length; iii++) {
210 String[] rule = rules[iii];
211 text = text.replaceAll(rule[0], rule[1]);
212 }
213 return text;
214 }
215
216
217
218
219
220
221
222
223 private String applyStereoRSZERule(String text) {
224 Pattern coarseRSZEPattern = Pattern.compile("(\\((?:\\d*?A*?[RSZE]\\,?)+?\\))");
225 Matcher coarseRSZEMatcher = coarseRSZEPattern.matcher(text);
226 StringBuffer textStringBuffer = new StringBuffer();
227 boolean found = false;
228 int oldStart = 0;
229 int start = 0;
230 while (coarseRSZEMatcher.find()) {
231 found = true;
232 start = coarseRSZEMatcher.start(1);
233 String group = coarseRSZEMatcher.group(1);
234 Pattern RSZEPattern = Pattern.compile("([RSZE])");
235 Matcher RSZEMatcher = RSZEPattern.matcher(group);
236 group = RSZEMatcher.replaceAll("<stereo>$1</stereo>");
237 textStringBuffer.append(text.substring(oldStart, start) + group);
238 oldStart = coarseRSZEMatcher.end(1);
239 }
240 textStringBuffer.append(text.substring(oldStart));
241 if (found) text = textStringBuffer.toString();
242 return text;
243 }
244
245
246
247
248
249
250
251 private String applyStereoRSZERuleReversed(String text) {
252 return text.replaceAll("\"<stereo>([RSZE])</stereo>\"", "$1");
253 }
254
255
256
257
258
259
260
261 private String applyDecapitalisationRule(String text) {
262 Pattern pattern = Pattern.compile("^\\[(\\p{Upper})(\\p{Lower})");
263 Matcher matcher = pattern.matcher(text);
264 if (matcher.find()) {
265 text = matcher.replaceAll("[" + matcher.group(1).toLowerCase() + matcher.group(2));
266 }
267 return text;
268 }
269
270
271
272
273
274
275
276 private String applyReverseStereoRules(String text) {
277 for (int iii = 0; iii < STEREO_TERMS.length; iii++) {
278 String stereoTerm = STEREO_TERMS[iii];
279 text = text.replaceAll("([\\-\\,])<stereo>" + stereoTerm + "</stereo>([\\-\\,\\(])", "$1" + stereoTerm + "$2");
280 text = text.replaceAll("<stereo>" + stereoTerm + "</stereo>([\\-\\,\\(])", stereoTerm + "$1");
281 }
282 return text;
283 }
284 }