1 package uk.ac.ebi.intenz.tools.sib.validator;
2
3 import org.apache.log4j.Logger;
4
5 import java.io.BufferedReader;
6 import java.io.StringReader;
7 import java.io.IOException;
8 import java.util.regex.Pattern;
9 import java.util.regex.Matcher;
10
11 import uk.ac.ebi.intenz.tools.sib.writer.LineType;
12 import uk.ac.ebi.intenz.tools.sib.exceptions.EnzymeEntryValidationException;
13
14
15
16
17
18
19
20 public class EnzymeEntryValidator {
21
22 private static final Logger LOGGER =
23 Logger.getLogger(EnzymeEntryValidator.class.getName());
24
25 private static final String ID_LINE_CONTENT_REGEXP = "\\d+?\\.\\d+?\\.\\d+?\\.\\d+?\n";
26 private static final String DE_AN_CF_LINE_CONTENT_REGEXP = ".+?\\.\n";
27 private static final String CA_LINE_CONTENT_REGEXP = ".+?\n";
28 private static final String LAST_CA_LINE_CONTENT_REGEXP = ".+?\\.\n";
29 private static final String CC_SINGLE_LINE_CONTENT_REGEXP = "CC \\-\\!\\- .+?(\\.|http\\:\\/\\/\\S+?\\/)\n";
30 private static final String CC_TWO_LINE_CONTENT_REGEXP = "CC \\-\\!\\- .+?\\\n" +
31 "CC .+?(\\.|http\\:\\/\\/\\S+?\\/)\n";
32 private static final String CC_MULTI_LINE_CONTENT_REGEXP = "CC \\-\\!\\- .+?\n" +
33 "(?:CC .+?\n)+?" +
34 "CC .+?(\\.|http\\:\\/\\/\\S+?\\/)\n";
35 private static final String DI_LINE_CONTENT_REGEXP = ".+?\\; MIM\\:\\d+?\\.\n";
36 private static final String PR_LINE_CONTENT_REGEXP = "PROSITE\\; PDOC\\d+?\\;\n";
37 private static final String DR_LINE_SINGLE_XREF_REGEXP = "(?:.+?\\,\\s\\w{1,5}\\_\\w{1,5}\\s*?\\;){1}\n";
38 private static final String DR_LINE_DOUBLE_XREF_REGEXP = "(?:.+?\\,\\s\\w{1,5}\\_\\w{1,5}\\s*?\\;){2}\n";
39 private static final String DR_LINE_TRIPLE_XREF_REGEXP = "(?:.+?\\,\\s\\w{1,5}\\_\\w{1,5}\\s*?\\;){3}\n";
40
41 public static boolean validate(String entry) throws EnzymeEntryValidationException {
42 if (entry == null) throw new NullPointerException("Parameter 'entry' must not be null.");
43 if (entry.equals("")) throw new EnzymeEntryValidationException("The given entry is empty.");
44
45 BufferedReader entryReader = new BufferedReader(new StringReader(entry));
46 try {
47 StringBuffer CCLineStringBuffer = new StringBuffer();
48 boolean CCLinesProcessed = false;
49 String line = entryReader.readLine() + "\n";
50 while (line != null) {
51 if (line.equals("//\n")) break; // end of entry reached
52 Pattern lineTypePattern = Pattern.compile("(ID|DE|AN|CA|CF|CC|DI|PR|DR) (.+?\n)");
53 Matcher lineTypePatternMatcher = lineTypePattern.matcher(line);
54 if (lineTypePatternMatcher.find()) {
55 String lineType = lineTypePatternMatcher.group(1);
56
57 if (lineType.equals(LineType.ID.toString()) &&
58 !lineTypePatternMatcher.group(2).matches(ID_LINE_CONTENT_REGEXP))
59 throw new EnzymeEntryValidationException("The ID line does not contain a valid EC number.");
60
61 if (lineType.equals(LineType.DE.toString()) &&
62 !lineTypePatternMatcher.group(2).matches(DE_AN_CF_LINE_CONTENT_REGEXP))
63 throw new EnzymeEntryValidationException("The DE line does not contain valid content.");
64
65 if (lineType.equals(LineType.AN.toString()) &&
66 !lineTypePatternMatcher.group(2).matches(DE_AN_CF_LINE_CONTENT_REGEXP))
67 throw new EnzymeEntryValidationException("The AN line does not contain valid content.");
68
69 if (isLastCALine(line, entry)) {
70 if (lineType.equals(LineType.CA.toString()) &&
71 !lineTypePatternMatcher.group(2).matches(LAST_CA_LINE_CONTENT_REGEXP))
72 throw new EnzymeEntryValidationException("The CA line does not contain valid content.");
73 } else {
74 if (lineType.equals(LineType.CA.toString()) &&
75 !lineTypePatternMatcher.group(2).matches(CA_LINE_CONTENT_REGEXP))
76 throw new EnzymeEntryValidationException("The CA line does not contain valid content.");
77 }
78
79 if (lineType.equals(LineType.CF.toString()) &&
80 !lineTypePatternMatcher.group(2).matches(DE_AN_CF_LINE_CONTENT_REGEXP))
81 throw new EnzymeEntryValidationException("The CF line does not contain valid content.");
82
83
84 if (lineType.equals(LineType.CC.toString()) && !CCLinesProcessed) {
85 CCLineStringBuffer.append(line);
86 line = entryReader.readLine();
87 if (line != null) line += "\n";
88
89 if (line != null && line.matches("CC.*?\n")) {
90 if (line.matches("CC \\-\\!\\-.*?\n")) {
91 if (!CCLineStringBuffer.toString().matches(CC_SINGLE_LINE_CONTENT_REGEXP) &&
92 !CCLineStringBuffer.toString().matches(CC_TWO_LINE_CONTENT_REGEXP) &&
93 !CCLineStringBuffer.toString().matches(CC_MULTI_LINE_CONTENT_REGEXP))
94 throw new EnzymeEntryValidationException("The comment does not comply to the ENZYME comment format.");
95 CCLineStringBuffer = new StringBuffer();
96
97 }
98 } else {
99 if (!CCLineStringBuffer.toString().matches(CC_SINGLE_LINE_CONTENT_REGEXP) &&
100 !CCLineStringBuffer.toString().matches(CC_TWO_LINE_CONTENT_REGEXP) &&
101 !CCLineStringBuffer.toString().matches(CC_MULTI_LINE_CONTENT_REGEXP))
102 throw new EnzymeEntryValidationException("The comment does not comply to the ENZYME comment format.");
103 CCLinesProcessed = true;
104 }
105 continue;
106 }
107
108 if (lineType.equals(LineType.DI.toString()) &&
109 !lineTypePatternMatcher.group(2).matches(DI_LINE_CONTENT_REGEXP))
110 throw new EnzymeEntryValidationException("The DI line does not contain valid content.");
111
112 if (lineType.equals(LineType.PR.toString()) &&
113 !lineTypePatternMatcher.group(2).matches(PR_LINE_CONTENT_REGEXP))
114 throw new EnzymeEntryValidationException("The PR line does not contain valid content.");
115
116 if (lineType.equals(LineType.DR.toString())) {
117 switch (getNumberOfXrefs(lineTypePatternMatcher.group(2))) {
118 case 1:
119 if (!lineTypePatternMatcher.group(2).matches(DR_LINE_SINGLE_XREF_REGEXP))
120 throw new EnzymeEntryValidationException("The DR line does not contain valid content.");
121 break;
122 case 2:
123 if (!lineTypePatternMatcher.group(2).matches(DR_LINE_DOUBLE_XREF_REGEXP))
124 throw new EnzymeEntryValidationException("The DR line does not contain valid content.");
125 break;
126 case 3:
127 if (!lineTypePatternMatcher.group(2).matches(DR_LINE_TRIPLE_XREF_REGEXP))
128 throw new EnzymeEntryValidationException("The DR line does not contain valid content.");
129 break;
130 default :
131 throw new EnzymeEntryValidationException("The DR line does not contain valid content.");
132 }
133 }
134
135 } else {
136 throw new EnzymeEntryValidationException("No valid line type found.");
137 }
138
139 line = entryReader.readLine();
140 if (line != null) line += "\n";
141 }
142 } catch (IOException e) {
143 LOGGER.error("Error while reading a line of an entry.", e);
144 } finally {
145 try {
146 entryReader.close();
147 } catch (IOException e) {
148 LOGGER.error("Error while closing the reader.", e);
149 }
150 }
151
152 return true;
153 }
154
155
156
157
158
159
160
161
162 private static boolean isLastCALine(String line, String entry) {
163 assert line != null : "Parameter 'line' must not be null.";
164 assert entry != null : "Parameter 'entry' must not be null.";
165 Pattern CALinePattern = Pattern.compile("(CA.*?\n)");
166 Matcher CALinePatternMatcher = CALinePattern.matcher(entry);
167 boolean foundLine = false;
168 while (CALinePatternMatcher.find()) {
169 if (foundLine) return false;
170 if (CALinePatternMatcher.group(1).equals(line)) foundLine = true;
171 }
172 return true;
173 }
174
175
176
177
178
179
180
181 private static int getNumberOfXrefs(String DRLineContent) {
182 assert DRLineContent != null : "Parameter 'DRLineContent' must not be null.";
183 Pattern DRXrefPattern = Pattern.compile(".+?\\;");
184 Matcher DRXrefPatternMatcher = DRXrefPattern.matcher(DRLineContent);
185 int count = 0;
186 while (DRXrefPatternMatcher.find()) {
187 count++;
188 }
189 return count;
190 }
191
192 }