1
2
3
4
5
6 package uk.ac.ebi.intenz.tools.sib.translator.rules;
7
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.log4j.Logger;
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 public class GrammarRules implements RuleGroup {
32
33 private static final Logger LOGGER = Logger.getLogger(GrammarRules.class);
34
35 private static GrammarRules INSTANCE = new GrammarRules();
36
37 private final String[][] CAPITILISATION_RULES_REVERSED = {
38 {"^TRNA", "tRNA"},
39 {"^MRNA", "mRNA"},
40 {"^RRNA", "rRNA"},
41 {"^CDNA", "cDNA"},
42 {"^DAMP", "dAMP"},
43 {"^DADP", "dADP"},
44 {"^DATP", "dATP"},
45 {"^DUMP", "dUMP"},
46 {"^DUDP", "dUDP"},
47 {"^DUTP", "dUTP"},
48 {"^DCMP", "dCMP"},
49 {"^DCDP", "dCDP"},
50 {"^DCTP", "dCTP"},
51 {"^DGMP", "dGMP"},
52 {"^DGDP", "dGDP"},
53 {"^DGTP", "dGTP"},
54 {"^DTMP", "dTMP"},
55 {"^DTDP", "dTDP"},
56 {"^DTTP", "dTTP"},
57 {"^CAMP", "cAMP"},
58 {"^CGMP", "cGMP"}
59
60 };
61
62
63 private final String[] CAPITILISATION_EXCLUSIONS = {
64 "^p\\-",
65 "^m\\-",
66 "^o\\-",
67 "^n\\-",
68 "^tRNA",
69 "^tRNase",
70 "^mRNA",
71 "^rRNA",
72 "^cDNA",
73 "^dAMP",
74 "^dADP",
75 "^dATP",
76 "^dUMP",
77 "^dUDP",
78 "^dUTP",
79 "^dCMP",
80 "^dCDP",
81 "^dCTP",
82 "^dGMP",
83 "^dGDP",
84 "^dGTP",
85 "^dTMP",
86 "^dTDP",
87 "^dTTP",
88 "^cAMP",
89 "^cGMP",
90 "^ppGpp",
91 "^pppGpp",
92 "^\\(ppGpp\\)",
93 "^\\(fMet\\)",
94 "^fMet",
95 "^n\\s",
96 "^m\\s",
97 "^tPA",
98 "^(\\[|\\(|\\s)eIF",
99 "^sn-",
100 "^\\d*?\\([\\+\\-\\d]+?\\)\\-",
101 "^cd-"
102 };
103
104 private final Pattern[] SNEAKY_CAPIT_PATTERNS = {
105 Pattern.compile("^(\\[|\\()(\\p{Lower})(\\p{Lower})")
106 };
107
108
109 private final Pattern SNEAKY_CAPIT_PATTERN = Pattern.compile("^(\\[|\\()(\\w)");
110
111 private final ArrayList DECAPITALIZATION_PATTERNS = new ArrayList();
112
113 private GrammarRules () {
114 InputStream stream = this.getClass().getClassLoader().getResourceAsStream("decapitalization_patterns.txt");
115 BufferedReader br = new BufferedReader(new InputStreamReader(stream));
116 String pattern;
117 try {
118 pattern = br.readLine();
119 while (pattern != null){
120 DECAPITALIZATION_PATTERNS.add(pattern);
121 pattern = br.readLine();
122 }
123 } catch (IOException e) {
124 e.printStackTrace();
125 } finally {
126 try {
127 br.close();
128 stream.close();
129 } catch (IOException e){}
130 }
131 }
132
133
134
135
136
137
138
139
140 public static GrammarRules getInstance() {
141 return INSTANCE;
142 }
143
144
145 public String applyRules (String text) {
146 return reverseRules(text);
147 }
148
149 public String reverseRules(String text) {
150 if (text == null)
151 throw new NullPointerException("Parameter 'text' must not be null.");
152 if (text.length() == 0)
153 return text;
154
155
156 boolean excludeFromCapitilisation = false;
157 for (int iii = 0; iii < CAPITILISATION_EXCLUSIONS.length; iii++) {
158 String exclusion = CAPITILISATION_EXCLUSIONS[iii];
159 if (Pattern.compile(exclusion).matcher(text).find()) {
160 excludeFromCapitilisation = true;
161 break;
162 }
163 }
164 if (!excludeFromCapitilisation)
165 text = doCapitilisation(text);
166 text = doDecapitalisation(text);
167
168 return text;
169 }
170
171 private String doCapitilisation (String text) {
172 for (int i = 0; i < SNEAKY_CAPIT_PATTERNS.length; i++){
173 Matcher matcher = SNEAKY_CAPIT_PATTERNS[i].matcher(text);
174 if (matcher.find()){
175 text = matcher.replaceAll(matcher.group(1) + matcher.group(2).toUpperCase() + matcher.group(3));
176 }
177 }
178 text = text.substring(0,1).toUpperCase() + text.substring(1);
179 text = translate(text, CAPITILISATION_RULES_REVERSED);
180 return text;
181 }
182
183 private String doDecapitalisation(String text) {
184 for (int i = 0; i < DECAPITALIZATION_PATTERNS.size(); i++){
185 String s = (String) DECAPITALIZATION_PATTERNS.get(i);
186 Pattern pattern = Pattern.compile("(^|\\.\\s)(" + s + ")(\\p{Upper})");
187 Matcher matcher = pattern.matcher(text);
188 while (matcher.find()){
189 text = matcher.replaceFirst(matcher.group(1) + matcher.group(2) + matcher.group(3).toLowerCase());
190 matcher = pattern.matcher(text);
191 }
192 }
193 return text;
194 }
195
196
197
198
199
200
201
202
203 private String translate(String text, String[][] rules) {
204 for (int iii = 0; iii < rules.length; iii++) {
205 String[] rule = rules[iii];
206 text = text.replaceAll(rule[0], rule[1]);
207 }
208 return text;
209 }
210 }