View Javadoc

1   package uk.ac.ebi.intenz.tools.sib.validator;
2   
3   import org.apache.log4j.Logger;
4   
5   import java.io.BufferedReader;
6   import java.io.StringReader;
7   import java.io.IOException;
8   import java.util.regex.Pattern;
9   import java.util.regex.Matcher;
10  
11  import uk.ac.ebi.intenz.tools.sib.writer.LineType;
12  import uk.ac.ebi.intenz.tools.sib.exceptions.EnzymeEntryValidationException;
13  
14  /**
15   * This class provides a method to validate ENZYME entries.
16   *
17   * @author Michael Darsow
18   * @version $Revision: 1.2 $ $Date: 2008/01/28 11:43:23 $
19   */
20  public class EnzymeEntryValidator {
21  
22    private static final Logger LOGGER =
23  	  Logger.getLogger(EnzymeEntryValidator.class.getName());
24  
25    private static final String ID_LINE_CONTENT_REGEXP = "\\d+?\\.\\d+?\\.\\d+?\\.\\d+?\n";
26    private static final String DE_AN_CF_LINE_CONTENT_REGEXP = ".+?\\.\n";
27    private static final String CA_LINE_CONTENT_REGEXP = ".+?\n";
28    private static final String LAST_CA_LINE_CONTENT_REGEXP = ".+?\\.\n";
29    private static final String CC_SINGLE_LINE_CONTENT_REGEXP = "CC   \\-\\!\\- .+?(\\.|http\\:\\/\\/\\S+?\\/)\n";
30    private static final String CC_TWO_LINE_CONTENT_REGEXP = "CC   \\-\\!\\- .+?\\\n" +
31                                                             "CC       .+?(\\.|http\\:\\/\\/\\S+?\\/)\n";
32    private static final String CC_MULTI_LINE_CONTENT_REGEXP = "CC   \\-\\!\\- .+?\n" +
33                                                               "(?:CC       .+?\n)+?" +
34                                                               "CC       .+?(\\.|http\\:\\/\\/\\S+?\\/)\n";
35    private static final String DI_LINE_CONTENT_REGEXP = ".+?\\; MIM\\:\\d+?\\.\n";
36    private static final String PR_LINE_CONTENT_REGEXP = "PROSITE\\; PDOC\\d+?\\;\n";
37    private static final String DR_LINE_SINGLE_XREF_REGEXP = "(?:.+?\\,\\s\\w{1,5}\\_\\w{1,5}\\s*?\\;){1}\n";
38    private static final String DR_LINE_DOUBLE_XREF_REGEXP = "(?:.+?\\,\\s\\w{1,5}\\_\\w{1,5}\\s*?\\;){2}\n";
39    private static final String DR_LINE_TRIPLE_XREF_REGEXP = "(?:.+?\\,\\s\\w{1,5}\\_\\w{1,5}\\s*?\\;){3}\n";
40  
41    public static boolean validate(String entry) throws EnzymeEntryValidationException {
42      if (entry == null) throw new NullPointerException("Parameter 'entry' must not be null.");
43      if (entry.equals("")) throw new EnzymeEntryValidationException("The given entry is empty.");
44  
45      BufferedReader entryReader = new BufferedReader(new StringReader(entry));
46      try {
47        StringBuffer CCLineStringBuffer = new StringBuffer();
48        boolean CCLinesProcessed = false;
49        String line = entryReader.readLine() + "\n";
50        while (line != null) {
51          if (line.equals("//\n")) break; // end of entry reached
52          Pattern lineTypePattern = Pattern.compile("(ID|DE|AN|CA|CF|CC|DI|PR|DR)   (.+?\n)");
53          Matcher lineTypePatternMatcher = lineTypePattern.matcher(line);
54          if (lineTypePatternMatcher.find()) {
55            String lineType = lineTypePatternMatcher.group(1);
56  
57            if (lineType.equals(LineType.ID.toString()) &&
58                !lineTypePatternMatcher.group(2).matches(ID_LINE_CONTENT_REGEXP))
59              throw new EnzymeEntryValidationException("The ID line does not contain a valid EC number.");
60  
61            if (lineType.equals(LineType.DE.toString()) &&
62                !lineTypePatternMatcher.group(2).matches(DE_AN_CF_LINE_CONTENT_REGEXP))
63              throw new EnzymeEntryValidationException("The DE line does not contain valid content.");
64  
65            if (lineType.equals(LineType.AN.toString()) &&
66                !lineTypePatternMatcher.group(2).matches(DE_AN_CF_LINE_CONTENT_REGEXP))
67              throw new EnzymeEntryValidationException("The AN line does not contain valid content.");
68  
69            if (isLastCALine(line, entry)) {
70              if (lineType.equals(LineType.CA.toString()) &&
71                  !lineTypePatternMatcher.group(2).matches(LAST_CA_LINE_CONTENT_REGEXP))
72                throw new EnzymeEntryValidationException("The CA line does not contain valid content.");
73            } else {
74              if (lineType.equals(LineType.CA.toString()) &&
75                  !lineTypePatternMatcher.group(2).matches(CA_LINE_CONTENT_REGEXP))
76                throw new EnzymeEntryValidationException("The CA line does not contain valid content.");
77            }
78  
79            if (lineType.equals(LineType.CF.toString()) &&
80                !lineTypePatternMatcher.group(2).matches(DE_AN_CF_LINE_CONTENT_REGEXP))
81              throw new EnzymeEntryValidationException("The CF line does not contain valid content.");
82  
83            // Checks every sentence (pattern: CC   -!- .+?.\n) for validity.
84            if (lineType.equals(LineType.CC.toString()) && !CCLinesProcessed) {
85              CCLineStringBuffer.append(line); // store line
86              line = entryReader.readLine(); // read next line
87              if (line != null) line += "\n";
88  
89              if (line != null && line.matches("CC.*?\n")) { // if the next line is a CC line
90                if (line.matches("CC   \\-\\!\\-.*?\n")) { // check if this line is the beginning of a new sentence
91                  if (!CCLineStringBuffer.toString().matches(CC_SINGLE_LINE_CONTENT_REGEXP) &&
92                      !CCLineStringBuffer.toString().matches(CC_TWO_LINE_CONTENT_REGEXP) &&
93                      !CCLineStringBuffer.toString().matches(CC_MULTI_LINE_CONTENT_REGEXP)) // check the sentence stored in the buffer for validity
94                    throw new EnzymeEntryValidationException("The comment does not comply to the ENZYME comment format.");
95                  CCLineStringBuffer = new StringBuffer(); // empty StringBuffer for next sentence
96  //                CCLineStringBuffer.append(line); // add beginning of next sentence to the empty StringBuffer
97                }
98              } else { // check last sentence
99                if (!CCLineStringBuffer.toString().matches(CC_SINGLE_LINE_CONTENT_REGEXP) &&
100                   !CCLineStringBuffer.toString().matches(CC_TWO_LINE_CONTENT_REGEXP) &&
101                   !CCLineStringBuffer.toString().matches(CC_MULTI_LINE_CONTENT_REGEXP))
102                 throw new EnzymeEntryValidationException("The comment does not comply to the ENZYME comment format.");
103               CCLinesProcessed = true;
104             }
105             continue; // new line has been acquired already
106           }
107 
108           if (lineType.equals(LineType.DI.toString()) &&
109               !lineTypePatternMatcher.group(2).matches(DI_LINE_CONTENT_REGEXP))
110             throw new EnzymeEntryValidationException("The DI line does not contain valid content.");
111 
112           if (lineType.equals(LineType.PR.toString()) &&
113               !lineTypePatternMatcher.group(2).matches(PR_LINE_CONTENT_REGEXP))
114             throw new EnzymeEntryValidationException("The PR line does not contain valid content.");
115 
116           if (lineType.equals(LineType.DR.toString())) {
117             switch (getNumberOfXrefs(lineTypePatternMatcher.group(2))) {
118               case 1:
119                 if (!lineTypePatternMatcher.group(2).matches(DR_LINE_SINGLE_XREF_REGEXP))
120                   throw new EnzymeEntryValidationException("The DR line does not contain valid content.");
121                 break;
122               case 2:
123                 if (!lineTypePatternMatcher.group(2).matches(DR_LINE_DOUBLE_XREF_REGEXP))
124                   throw new EnzymeEntryValidationException("The DR line does not contain valid content.");
125                 break;
126               case 3:
127                 if (!lineTypePatternMatcher.group(2).matches(DR_LINE_TRIPLE_XREF_REGEXP))
128                   throw new EnzymeEntryValidationException("The DR line does not contain valid content.");
129                 break;
130               default :
131                 throw new EnzymeEntryValidationException("The DR line does not contain valid content.");
132             }
133           }
134 
135         } else {
136           throw new EnzymeEntryValidationException("No valid line type found.");
137         }
138 
139         line = entryReader.readLine();
140         if (line != null) line += "\n";
141       }
142     } catch (IOException e) {
143       LOGGER.error("Error while reading a line of an entry.", e);
144     } finally {
145       try {
146         entryReader.close();
147       } catch (IOException e) {
148         LOGGER.error("Error while closing the reader.", e);
149       }
150     }
151 
152     return true;
153   }
154 
155   /**
156    * Checks whether the given CA line is the last CA line of this entry.
157    *
158    * @param line  The CA line to be checked.
159    * @param entry The current entry being validated.
160    * @return <code>true</code> if the given CA line is the last CA line of this entry.
161    */
162   private static boolean isLastCALine(String line, String entry) {
163     assert line != null : "Parameter 'line' must not be null.";
164     assert entry != null : "Parameter 'entry' must not be null.";
165     Pattern CALinePattern = Pattern.compile("(CA.*?\n)");
166     Matcher CALinePatternMatcher = CALinePattern.matcher(entry);
167     boolean foundLine = false;
168     while (CALinePatternMatcher.find()) {
169       if (foundLine) return false;
170       if (CALinePatternMatcher.group(1).equals(line)) foundLine = true;
171     }
172     return true;
173   }
174 
175   /**
176    * Checks how many cross-references might be contained in the given DR line.
177    *
178    * @param DRLineContent The DR line to be checked.
179    * @return the number of cross-reference occurrences
180    */
181   private static int getNumberOfXrefs(String DRLineContent) {
182     assert DRLineContent != null : "Parameter 'DRLineContent' must not be null.";
183     Pattern DRXrefPattern = Pattern.compile(".+?\\;");
184     Matcher DRXrefPatternMatcher = DRXrefPattern.matcher(DRLineContent);
185     int count = 0;
186     while (DRXrefPatternMatcher.find()) {
187       count++;
188     }
189     return count;
190   }
191 
192 }