1 package uk.ac.ebi.intenz.tools.sib.comparator;
2
3 import org.apache.log4j.Logger;
4 import uk.ac.ebi.intenz.tools.sib.exceptions.EnzymeEntryValidationException;
5 import uk.ac.ebi.intenz.tools.sib.validator.EnzymeEntryValidator;
6 import uk.ac.ebi.intenz.tools.sib.writer.LineType;
7 import uk.ac.ebi.intenz.domain.enzyme.EnzymeCommissionNumber;
8 import uk.ac.ebi.intenz.domain.exceptions.EcException;
9
10 import java.io.*;
11 import java.util.regex.Pattern;
12 import java.util.regex.Matcher;
13 import java.util.List;
14 import java.util.ArrayList;
15
16
17
18
19
20
21
22 public class FlatFileComparator {
23
24 private static final Logger LOGGER =
25 Logger.getLogger(FlatFileComparator.class.getName());
26 private static final Logger LOGGER_ID = Logger.getLogger("ID");
27 private static final Logger LOGGER_DE = Logger.getLogger("DE");
28 private static final Logger LOGGER_AN = Logger.getLogger("AN");
29 private static final Logger LOGGER_CA = Logger.getLogger("CA");
30 private static final Logger LOGGER_CC = Logger.getLogger("CC");
31 private static final Logger LOGGER_CF = Logger.getLogger("CF");
32 private static final Logger LOGGER_DI = Logger.getLogger("DI");
33 private static final Logger LOGGER_DR = Logger.getLogger("DR");
34 private static final Logger LOGGER_PR = Logger.getLogger("PR");
35
36
37
38
39 private static final int START_LINE_NO = 27;
40 private static final String NEW_LINE = "\n";
41
42 private static final String END_OF_ENTRY = new String("\\/\\/.*?");
43
44 private static final String MISSING_ENZYME_ENTRY = new String("MISSING ENZYME ENTRY");
45 private static final String MISSING_INTENZ_ENTRY = new String("MISSING INTENZ ENTRY");
46
47 private static int idCounter = 0;
48 private static int deCounter = 0;
49 private static int anCounter = 0;
50 private static int caCounter = 0;
51 private static int ccCounter = 0;
52 private static int cfCounter = 0;
53 private static int diCounter = 0;
54 private static int prCounter = 0;
55 private static int drCounter = 0;
56
57 public static void compare (File flatFile1, File flatFile2) {
58 BufferedReader flatFile1Reader = null;
59 BufferedReader flatFile2Reader = null;
60 try {
61 flatFile1Reader = new BufferedReader(new FileReader(flatFile1));
62 flatFile2Reader = new BufferedReader(new FileReader(flatFile2));
63
64 int lineNo = 1;
65
66 String lineOfLoadedFF = flatFile1Reader.readLine();
67 lineOfLoadedFF = addNewLine(lineOfLoadedFF);
68 String lineOfGeneratedFF = flatFile2Reader.readLine();
69 lineOfGeneratedFF = addNewLine(lineOfGeneratedFF);
70
71 StringBuffer entryA = new StringBuffer();
72 StringBuffer entryB = new StringBuffer();
73 String missingEntry = "";
74
75 while ( lineOfLoadedFF != null && lineOfGeneratedFF != null ) {
76
77 if ( lineNo > START_LINE_NO ) {
78 if (missingEntry != MISSING_INTENZ_ENTRY) entryA.append(lineOfLoadedFF);
79 if (missingEntry != MISSING_ENZYME_ENTRY) entryB.append(lineOfGeneratedFF);
80 try {
81
82
83
84 if ( !isNotEndOfEntry(lineOfLoadedFF) ) {
85
86 if (missingEntry != MISSING_ENZYME_ENTRY)
87 findEndOfEntry(lineOfGeneratedFF, flatFile2Reader, entryB);
88
89 String outputResult = compareEntries(entryA.toString(), entryB.toString());
90 if ( outputResult.length() != 0 ){
91 missingEntry = checkForMissingEntries(outputResult);
92 LOGGER.info(outputResult);
93 } else {
94 missingEntry = "";
95 }
96 if (missingEntry != MISSING_ENZYME_ENTRY) entryB.delete(0, entryB.length());
97 if (missingEntry != MISSING_INTENZ_ENTRY) entryA.delete(0, entryA.length());
98
99
100
101 } else if ( !isNotEndOfEntry(lineOfGeneratedFF) ) {
102 if (missingEntry != MISSING_INTENZ_ENTRY)
103 findEndOfEntry(lineOfLoadedFF, flatFile1Reader, entryA);
104
105 String outputResult = compareEntries(entryA.toString(), entryB.toString());
106 if ( outputResult.length() != 0 ){
107 missingEntry = checkForMissingEntries(outputResult);
108 LOGGER.info(outputResult);
109 } else {
110 missingEntry = "";
111 }
112 if (missingEntry != MISSING_ENZYME_ENTRY) entryB.delete(0, entryB.length());
113 if (missingEntry != MISSING_INTENZ_ENTRY) entryA.delete(0, entryA.length());
114 }
115 } catch ( EnzymeEntryValidationException e ) {
116 LOGGER.error(e);
117 clearBuffers(entryA, entryB);
118 }
119 }
120
121 lineNo++;
122 if (missingEntry != MISSING_INTENZ_ENTRY) lineOfLoadedFF = nextLine(flatFile1Reader);
123 if (missingEntry != MISSING_ENZYME_ENTRY) lineOfGeneratedFF = nextLine(flatFile2Reader);
124 if ( lineOfLoadedFF.equals("null" + NEW_LINE) || lineOfGeneratedFF.equals("null" + NEW_LINE) )
125 break;
126 }
127
128 logTotals();
129
130 } catch ( IOException e ) {
131 LOGGER.error("Error while reading a flat file.", e);
132 } finally {
133 try {
134 if ( flatFile1Reader != null ) flatFile1Reader.close();
135 if ( flatFile2Reader != null ) flatFile2Reader.close();
136 } catch ( IOException e ) {
137 LOGGER.error("Error while closing a reader.", e);
138 }
139 }
140 }
141
142
143
144
145
146
147
148 private static String checkForMissingEntries(String outputResult) {
149 String missingEntry = "";
150 if (outputResult.startsWith(MISSING_ENZYME_ENTRY)){
151 missingEntry = MISSING_ENZYME_ENTRY;
152 } else if (outputResult.startsWith(MISSING_INTENZ_ENTRY)){
153 missingEntry = MISSING_INTENZ_ENTRY;
154 }
155 return missingEntry;
156 }
157
158 private static String nextLine(BufferedReader reader) throws IOException {
159 String line = reader.readLine();
160 return addNewLine(line);
161 }
162
163
164
165
166
167
168
169
170
171
172 public static String compareEntries (String entryA, String entryB) throws EnzymeEntryValidationException {
173 if ( entryA == null ) throw new NullPointerException("Parameter 'entryA' must not be null.");
174 if ( entryB == null ) throw new NullPointerException("Parameter 'entryB' must not be null.");
175 LOGGER.debug(entryA);
176 LOGGER.debug(entryB);
177 entryA = reconstructSentences(entryA);
178 entryB = reconstructSentences(entryB);
179
180
181 try {
182 EnzymeEntryValidator.validate(entryA);
183 } catch ( EnzymeEntryValidationException e ) {
184 throw new EnzymeEntryValidationException("Error in entryA: "
185 + e.getMessage()
186 + ((entryA.indexOf('\n') > -1)? entryA.substring(0, entryA.indexOf('\n')) : entryA));
187 }
188 try {
189 EnzymeEntryValidator.validate(entryB);
190 } catch ( EnzymeEntryValidationException e ) {
191 throw new EnzymeEntryValidationException("Error in entryB: "
192 + e.getMessage()
193 + ((entryB.indexOf('\n') > -1)? entryB.substring(0, entryB.indexOf('\n')) : entryB));
194 }
195
196 StringBuffer differences = new StringBuffer();
197
198
199 if ( entryA.equals(entryB) )
200 return differences.toString();
201
202 if ( lineIsDifferent(entryA, entryB, LineType.ID) ) {
203
204 String idLine = null;
205 String ecString = null;
206 try {
207 idLine = addNewLine(getLine(entryA, LineType.ID));
208 ecString = getEC(idLine);
209 EnzymeCommissionNumber ecA = EnzymeCommissionNumber.valueOf(ecString);
210 idLine = addNewLine(getLine(entryB, LineType.ID));
211 ecString = getEC(idLine);
212 EnzymeCommissionNumber ecB = EnzymeCommissionNumber.valueOf(ecString);
213 int ecDiff = ecA.compareTo(ecB);
214 if (ecDiff != 0){
215 if (ecDiff < 0){
216 differences.append(MISSING_INTENZ_ENTRY);
217 differences.append(": ");
218 differences.append(ecB.toString());
219 differences.append(" does not exist in ENZYME");
220 } else if (ecDiff > 0){
221 differences.append(MISSING_ENZYME_ENTRY);
222 differences.append(": ");
223 differences.append(ecA.toString());
224 differences.append(" does not exist in IntEnz");
225 }
226 LOGGER_ID.warn(differences.toString());
227 idCounter++;
228 return differences.toString();
229 }
230 } catch (EcException e) {
231 throw new EnzymeEntryValidationException("Bad EC number: " + ecString);
232 }
233
234
235
236
237
238 }
239 if ( lineIsDifferent(entryA, entryB, LineType.DE) ) {
240 String diffString = getDifferenceMessage(LineType.DE, entryA, entryB, LOGGER_DE);
241 if ( diffString != null && !diffString.equals("") ) {
242 deCounter++;
243 differences.append(diffString);
244 }
245 }
246
247 int lineIndex = linesAreDifferent(entryA, entryB, LineType.AN);
248 if ( lineIndex != -1 ) {
249 String diffString = getDifferenceMessage(LineType.AN, entryA, entryB, lineIndex, LOGGER_AN);
250 if ( diffString != null && !diffString.equals("") ) {
251 anCounter++;
252 differences.append(diffString);
253 }
254 }
255
256 lineIndex = linesAreDifferent(entryA, entryB, LineType.CA);
257 if ( lineIndex != -1 ) {
258 String diffString = getDifferenceMessage(LineType.CA, entryA, entryB, lineIndex, LOGGER_CA);
259 if ( diffString != null && !diffString.equals("") ) {
260 caCounter++;
261 differences.append(diffString);
262 }
263 }
264
265 lineIndex = linesAreDifferent(entryA, entryB, LineType.CF);
266 if ( lineIndex != -1 ) {
267 String diffString = getDifferenceMessage(LineType.CF, entryA, entryB, lineIndex, LOGGER_CF);
268 if ( diffString != null && !diffString.equals("") ) {
269 cfCounter++;
270 differences.append(diffString);
271 }
272 }
273
274 lineIndex = linesAreDifferent(entryA, entryB, LineType.CC);
275 if ( lineIndex != -1 ) {
276 String diffString = getDifferenceMessage(LineType.CC, entryA, entryB, lineIndex, LOGGER_CC);
277 if ( diffString != null && !diffString.equals("") ) {
278 ccCounter++;
279 differences.append(diffString);
280 }
281 }
282
283 lineIndex = linesAreDifferent(entryA, entryB, LineType.DI);
284 if ( lineIndex != -1 ) {
285 String diffString = getDifferenceMessage(LineType.DI, entryA, entryB, lineIndex, LOGGER_DI);
286 if ( diffString != null && !diffString.equals("") ) {
287 diCounter++;
288 differences.append(diffString);
289 }
290 }
291
292 lineIndex = linesAreDifferent(entryA, entryB, LineType.PR);
293 if ( lineIndex != -1 ) {
294 String diffString = getDifferenceMessage(LineType.PR, entryA, entryB, lineIndex, LOGGER_PR);
295 if ( diffString != null && !diffString.equals("") ) {
296 prCounter++;
297 differences.append(diffString);
298 }
299
300 }
301
302 lineIndex = linesAreDifferent(entryA, entryB, LineType.DR);
303 if ( lineIndex != -1 ) {
304 String diffString = getDifferenceMessage(LineType.DR, entryA, entryB, lineIndex, LOGGER_DR);
305 if ( diffString != null && !diffString.equals("") ) {
306 drCounter++;
307 differences.append(diffString);
308 }
309
310 }
311
312 if ( differences.length() == 0 || differences.length() == 1 ) {
313
314
315 }
316
317 return differences.toString();
318 }
319
320
321
322
323
324
325
326
327
328
329
330 private static String reconstructSentences (String entry) {
331
332 StringBuffer finalPreParsed = new StringBuffer();
333 StringBuffer hypenParsed = new StringBuffer();
334 Pattern idPattern = Pattern.compile("(ID \\d+?\\.\\d+?\\.\\d+?\\.\\d+?\n)");
335 Matcher idMatcher = idPattern.matcher(entry);
336
337
338 if ( idMatcher.find() ) {
339 finalPreParsed.append(idMatcher.group());
340 entry = entry.substring(finalPreParsed.length(), entry.length());
341 }
342
343
344 Pattern hyphenPattern = Pattern.compile("\\-\n(\\p{Upper}{2}\\s{3,}+)");
345 Matcher hyphenMatcher = hyphenPattern.matcher(entry);
346 boolean foundHyphen = hyphenMatcher.find();
347 if ( foundHyphen ) {
348 String whatFollows = entry.substring(hyphenMatcher.end());
349 if (whatFollows.startsWith("or ")
350 || whatFollows.startsWith("and ")
351 || whatFollows.startsWith("bonds ")){
352 hypenParsed.append(hyphenMatcher.replaceAll("- "));
353 } else {
354 hypenParsed.append(hyphenMatcher.replaceAll("-"));
355 }
356 } else {
357 hypenParsed.append(entry);
358 }
359
360
361
362 removeSpareText(hypenParsed, "(.[^\\.;\\/]|sp\\.|e\\.g\\.|cf\\.)(\n(\\p{Upper}{2}\\s{3,}+))");
363
364 removeSpareText(hypenParsed, "(CA .+?)\nCA ");
365
366 removeSpareText(hypenParsed, "(CC .+?)\nCC ");
367
368 finalPreParsed.append(hypenParsed.toString());
369
370 return finalPreParsed.toString();
371 }
372
373
374
375
376
377
378
379
380 private static void removeSpareText(StringBuffer text, String regex) {
381 Pattern pattern = Pattern.compile(regex);
382 Matcher matcher = pattern.matcher(text.toString());
383 boolean found = matcher.find();
384 while (found) {
385 String subString = matcher.group(0);
386 String replacement = matcher.group(1);
387 text.replace(text.indexOf(subString), text.indexOf(subString)
388 + subString.length(), replacement + " ");
389 found = matcher.find();
390 }
391 }
392
393
394
395
396
397
398
399
400
401
402 private static void findEndOfEntry (String lineOfLoadedFF, BufferedReader flatFile1Reader,
403 StringBuffer entryA) throws IOException {
404 if ( isNotEndOfEntry(lineOfLoadedFF) ) {
405 String line = readLineAndAppend(flatFile1Reader, entryA);
406 while ( isNotEndOfEntry(line) ) {
407 line = readLineAndAppend(flatFile1Reader, entryA);
408 }
409 }
410 }
411
412 private static boolean isNotEndOfEntry (String line) {
413 return (line != null && !line.trim().matches(END_OF_ENTRY));
414 }
415
416
417
418
419
420
421
422
423
424
425 private static String readLineAndAppend (BufferedReader flatFile2Reader, StringBuffer entryB) throws IOException {
426 String lineOfGeneratedFF;
427 lineOfGeneratedFF = flatFile2Reader.readLine();
428 lineOfGeneratedFF = addNewLine(lineOfGeneratedFF);
429 entryB.append(lineOfGeneratedFF);
430 return lineOfGeneratedFF;
431 }
432
433
434
435
436
437
438
439 private static void clearBuffers (StringBuffer entryA, StringBuffer entryB) {
440 entryA.delete(0, entryA.length());
441 entryB.delete(0, entryB.length());
442 }
443
444
445
446
447 private static void logTotals () {
448 int totalCounter = idCounter + deCounter + anCounter + caCounter + ccCounter
449 + cfCounter + diCounter + drCounter + prCounter;
450 LOGGER.info("-------------------------------------------------");
451 LOGGER.info("ID: " + idCounter);
452 LOGGER.info("DE: " + deCounter);
453 LOGGER.info("AN: " + anCounter);
454 LOGGER.info("CA: " + caCounter);
455 LOGGER.info("CC: " + ccCounter);
456 LOGGER.info("CF: " + cfCounter);
457 LOGGER.info("DI: " + diCounter);
458 LOGGER.info("DR: " + drCounter);
459 LOGGER.info("PR: " + prCounter);
460 LOGGER.info("-------------------------------------------------");
461 LOGGER.info("Total Errors: " + totalCounter);
462 LOGGER.info("-------------------------------------------------");
463 }
464
465
466
467
468
469
470 private static String addNewLine (String lineToAdd) {
471 return lineToAdd += NEW_LINE;
472 }
473
474
475
476
477
478
479
480
481
482 private static boolean lineIsDifferent (String entryA, String entryB, LineType lineType) {
483 assert entryA != null : "Parameter 'entryA' must not be null.";
484 assert entryB != null : "Parameter 'entryB' must not be null.";
485 assert lineType != null : "Parameter 'lineType' must not be null.";
486 String lineEntryA = getLine(entryA, lineType);
487 String lineEntryB = getLine(entryB, lineType);
488 return !lineEntryA.equals(lineEntryB);
489 }
490
491
492
493
494
495
496
497
498
499
500 private static int linesAreDifferent (String entryA, String entryB, LineType lineType) {
501 assert entryA != null : "Parameter 'entryA' must not be null.";
502 assert entryB != null : "Parameter 'entryB' must not be null.";
503 assert lineType != null : "Parameter 'lineType' must not be null.";
504 List linesEntryA = getLines(entryA, lineType);
505 List linesEntryB = getLines(entryB, lineType);
506 for ( int lineIndex = 0; lineIndex < linesEntryA.size(); lineIndex++ ) {
507 String lineEntryA = (String) linesEntryA.get(lineIndex);
508 if ( linesEntryB.size() - 1 < lineIndex )
509 return lineIndex + 1;
510 String lineEntryB = (String) linesEntryB.get(lineIndex);
511 if ( !lineEntryA.equals(lineEntryB) )
512 return lineIndex + 1;
513 }
514 return -1;
515 }
516
517
518
519
520
521
522
523
524
525
526 private static String getDifferenceMessage (LineType lineType, String entryA, String entryB, Logger logger) {
527 assert lineType != null : "Parameter 'lineType' must not be null.";
528 assert entryA != null : "Parameter 'entryA' must not be null.";
529 assert entryB != null : "Parameter 'entryB' must not be null.";
530 StringBuffer differences = new StringBuffer();
531 differences.append(lineType.toString());
532 differences.append(" line is different for entry: ");
533 differences.append(getEC(entryA));
534 differences.append(" (EC of ENZYME.dat)");
535 logger.info(differences);
536 StringBuffer enzymeDat = new StringBuffer("ENZYME.DAT: ").append(getLine(entryA, lineType));
537 differences.append(enzymeDat);
538 logger.info(enzymeDat);
539 StringBuffer intenzDat = new StringBuffer("INTENZ.DAT: ").append(getLine(entryB, lineType));
540 differences.append(intenzDat);
541 logger.info(intenzDat);
542 return differences.toString();
543 }
544
545
546
547
548
549
550
551
552
553
554
555
556 private static String getDifferenceMessage (LineType lineType, String entryA, String entryB, int lineIndex, Logger logger) {
557 assert lineType != null : "Parameter 'lineType' must not be null.";
558 assert entryA != null : "Parameter 'entryA' must not be null.";
559 StringBuffer differences = new StringBuffer();
560 differences.append(lineType.toString());
561 differences.append(" line ");
562 differences.append(lineIndex);
563 differences.append(" is different for entry: ");
564 differences.append(getEC(entryA));
565 differences.append(" (EC of enzyme.dat)");
566 logger.info(differences.toString());
567 StringBuffer enzymeDat = new StringBuffer("ENZYME.DAT: ").append(getLineOnLineIndex(entryA, lineType, lineIndex));
568 differences.append(enzymeDat);
569 logger.info(enzymeDat);
570 StringBuffer intenzDat = new StringBuffer("INTENZ.DAT: ").append(getLineOnLineIndex(entryB, lineType, lineIndex));
571 differences.append(intenzDat);
572 logger.info(intenzDat);
573 return differences.toString();
574 }
575
576
577
578
579
580
581
582 private static String getEC (String entryA) {
583 assert entryA != null : "Parameter 'entryA' must not be null.";
584 Pattern IDLinePattern = Pattern.compile("ID (\\d+?\\.\\d+?\\.\\d+?\\.\\d+?)\n");
585 Matcher IDLinePatternMatcher = IDLinePattern.matcher(entryA);
586 if ( IDLinePatternMatcher.find() ) return IDLinePatternMatcher.group(1);
587 return "";
588 }
589
590
591
592
593
594
595
596
597 private static String getLine (String entry, LineType lineType) {
598 assert entry != null : "Parameter 'entry' must not be null.";
599 Pattern linePattern = Pattern.compile("(" + lineType.toString() + " .+?)\n");
600 Matcher linePatternMatcher = linePattern.matcher(entry);
601 if ( linePatternMatcher.find() ) return linePatternMatcher.group(1);
602 return "";
603 }
604
605
606
607
608
609
610
611
612 private static List getLines (String entry, LineType lineType) {
613 assert entry != null : "Parameter 'entry' must not be null.";
614 assert lineType != null : "Parameter 'lineType' must not be null.";
615 Pattern linePattern = Pattern.compile("(" + lineType.toString() + " .+?)\n");
616 Matcher linePatternMatcher = linePattern.matcher(entry);
617 List lines = new ArrayList();
618 while ( linePatternMatcher.find() ) lines.add(linePatternMatcher.group(1));
619 return lines;
620 }
621
622
623
624
625
626
627
628
629
630
631
632 private static String getLineOnLineIndex (String entry, LineType lineType, int lineIndex) {
633 --lineIndex;
634 List lines = getLines(entry, lineType);
635 if ( lines.size() == 0 )
636 return new String("");
637 else if ( lineIndex < lines.size() )
638 return (String) lines.get(lineIndex);
639 else
640 return (String) lines.get(lines.size() - 1);
641 }
642
643 }