1
2
3
4
5
6 package uk.ac.ebi.intenz.tools.sib.translator.rules;
7
8 import java.io.BufferedReader;
9 import java.io.IOException;
10 import java.io.InputStream;
11 import java.io.InputStreamReader;
12 import java.util.ArrayList;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15
16 import org.apache.log4j.Logger;
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 public class GrammarRules implements RuleGroup {
32
33 private static final Logger LOGGER = Logger.getLogger(GrammarRules.class);
34
35 private static GrammarRules INSTANCE = new GrammarRules();
36
37 private final String[][] CAPITILISATION_RULES_REVERSED = {
38 {"^TRNA", "tRNA"},
39 {"^MRNA", "mRNA"},
40 {"^RRNA", "rRNA"},
41 {"^CDNA", "cDNA"},
42 {"^DAMP", "dAMP"},
43 {"^DADP", "dADP"},
44 {"^DATP", "dATP"},
45 {"^DUMP", "dUMP"},
46 {"^DUDP", "dUDP"},
47 {"^DUTP", "dUTP"},
48 {"^DCMP", "dCMP"},
49 {"^DCDP", "dCDP"},
50 {"^DCTP", "dCTP"},
51 {"^DGMP", "dGMP"},
52 {"^DGDP", "dGDP"},
53 {"^DGTP", "dGTP"},
54 {"^DTMP", "dTMP"},
55 {"^DTDP", "dTDP"},
56 {"^DTTP", "dTTP"},
57 {"^CAMP", "cAMP"},
58 {"^CGMP", "cGMP"}
59
60 };
61
62
63 private final String[] CAPITILISATION_EXCLUSIONS = {
64 "^p\\-",
65 "^m\\-",
66 "^o\\-",
67 "^n\\-",
68 "^tRNA",
69 "^tRNase",
70 "^mRNA",
71 "^rRNA",
72 "^cDNA",
73 "^dAMP",
74 "^dADP",
75 "^dATP",
76 "^dUMP",
77 "^dUDP",
78 "^dUTP",
79 "^dCMP",
80 "^dCDP",
81 "^dCTP",
82 "^dGMP",
83 "^dGDP",
84 "^dGTP",
85 "^dTMP",
86 "^dTDP",
87 "^dTTP",
88 "^dIDP",
89 "^dITP",
90 "^cAMP",
91 "^cGMP",
92 "^ppGpp",
93 "^pppGpp",
94 "^\\(ppGpp\\)",
95 "^\\(fMet\\)",
96 "^fMet",
97 "^n\\s",
98 "^m\\s",
99 "^tPA",
100 "^(\\[|\\(|\\s)eIF",
101 "^sn-",
102 "^\\d*?\\([\\+\\-\\d]+?\\)\\-",
103 "^cd-"
104 };
105
106 private final Pattern[] SNEAKY_CAPIT_PATTERNS = {
107 Pattern.compile("^(\\[|\\()(\\p{Lower})(\\p{Lower})")
108 };
109
110
111 private final Pattern SNEAKY_CAPIT_PATTERN = Pattern.compile("^(\\[|\\()(\\w)");
112
113 private final ArrayList DECAPITALIZATION_PATTERNS = new ArrayList();
114
115 private GrammarRules () {
116 InputStream stream = this.getClass().getClassLoader().getResourceAsStream("decapitalization_patterns.txt");
117 BufferedReader br = new BufferedReader(new InputStreamReader(stream));
118 String pattern;
119 try {
120 pattern = br.readLine();
121 while (pattern != null){
122 DECAPITALIZATION_PATTERNS.add(pattern);
123 pattern = br.readLine();
124 }
125 } catch (IOException e) {
126 e.printStackTrace();
127 } finally {
128 try {
129 br.close();
130 stream.close();
131 } catch (IOException e){}
132 }
133 }
134
135
136
137
138
139
140
141
142 public static GrammarRules getInstance() {
143 return INSTANCE;
144 }
145
146
147 public String applyRules (String text) {
148 return reverseRules(text);
149 }
150
151 public String reverseRules(String text) {
152 if (text == null)
153 throw new NullPointerException("Parameter 'text' must not be null.");
154 if (text.length() == 0)
155 return text;
156
157
158 boolean excludeFromCapitilisation = false;
159 for (int iii = 0; iii < CAPITILISATION_EXCLUSIONS.length; iii++) {
160 String exclusion = CAPITILISATION_EXCLUSIONS[iii];
161 if (Pattern.compile(exclusion).matcher(text).find()) {
162 excludeFromCapitilisation = true;
163 break;
164 }
165 }
166 if (!excludeFromCapitilisation)
167 text = doCapitilisation(text);
168 text = doDecapitalisation(text);
169
170 return text;
171 }
172
173 private String doCapitilisation (String text) {
174 for (int i = 0; i < SNEAKY_CAPIT_PATTERNS.length; i++){
175 Matcher matcher = SNEAKY_CAPIT_PATTERNS[i].matcher(text);
176 if (matcher.find()){
177 text = matcher.replaceAll(matcher.group(1) + matcher.group(2).toUpperCase() + matcher.group(3));
178 }
179 }
180 text = text.substring(0,1).toUpperCase() + text.substring(1);
181 text = translate(text, CAPITILISATION_RULES_REVERSED);
182 return text;
183 }
184
185 private String doDecapitalisation(String text) {
186 for (int i = 0; i < DECAPITALIZATION_PATTERNS.size(); i++){
187 String s = (String) DECAPITALIZATION_PATTERNS.get(i);
188 Pattern pattern = Pattern.compile("(^|\\.\\s)(" + s + ")(\\p{Upper})");
189 Matcher matcher = pattern.matcher(text);
190 while (matcher.find()){
191 text = matcher.replaceFirst(matcher.group(1) + matcher.group(2) + matcher.group(3).toLowerCase());
192 matcher = pattern.matcher(text);
193 }
194 }
195 return text;
196 }
197
198
199
200
201
202
203
204
205 private String translate(String text, String[][] rules) {
206 for (int iii = 0; iii < rules.length; iii++) {
207 String[] rule = rules[iii];
208 text = text.replaceAll(rule[0], rule[1]);
209 }
210 return text;
211 }
212 }