View Javadoc

1   package uk.ac.ebi.intenz.tools.sib.writer;
2   
3   import java.util.ArrayList;
4   import java.util.List;
5   import java.util.regex.Matcher;
6   import java.util.regex.Pattern;
7   
8   /**
9    * Handles the special needs regarding the format of the <code><b>CC</b></code> line.
10   * <p/>
11   * The <code><b>CC</b></code> line contains of sentences which are introduced by <code><b>-!-</b></code>. In some
12   * cases not all sentences are preceeded by this string, especially when it comes to ordered lists. These lists also
13   * have to be indented as shown in the example below.
14   * <p/>
15   * <b>Example:</b>
16   * <p/>
17   * <img src="../../../../../../../images/example_cc_formatter.gif">
18   *
19   * @author Michael Darsow
20   * @version $Revision: 1.2 $ $Date: 2008/01/28 11:43:23 $
21   */
22  public class CC_LineFormatter extends DefaultLineFormatter {
23    /**
24     * Formats the <code><b>CC</b></code> lines according to the format described in the class description.
25     *
26     * @param text The text of the <code><b>CC</b></code> line(s).
27     * @param lineType Must always be {@link LineType#CC LineType.CC}.
28     * @return the formatted <code><b>CC</b></code> line(s).
29     * @throws EnzymeFlatFileWriteException if an error occured during this process.
30     * @throws NullPointerException if any of the parameters is <code>null</code>.
31     * @throws IllegalArgumentException if <code>lineType</code> is not {@link LineType#CC LineType.CC}.
32     */
33    public String formatLines(String text, LineType lineType) throws EnzymeFlatFileWriteException {
34      if (text == null || lineType == null) throw new NullPointerException();
35      if (lineType != LineType.CC) throw new IllegalArgumentException();
36  
37      // Every sentence within the CC lines will be wrapped individually and starts with '-!-'.
38      List<String> commentSentences = getCommentSentences(text);
39      StringBuilder CCContent = new StringBuilder();
40      boolean containsOrderedListWithIndent = text.matches(".*?\\:\\s+?\\(\\d+\\)\\s+.+?");
41        for (Object commentSentence : commentSentences) {
42            String sentence = (String) commentSentence;
43            CCContent.append(wrapCCLine(sentence, containsOrderedListWithIndent));
44        }
45  
46      // Comments ending with '/' do not have a period in the end (e.g. URLs).
47      String ccLines = CCContent.toString();
48      if (CCContent.indexOf("/.") > -1)
49        return CCContent.replace(CCContent.indexOf("/."), CCContent.indexOf("/.") + 2, "/").toString();
50  
51      return ccLines;
52    }
53  
54    /**
55     * Inserts line breaks for a sentence in the CC line.
56     * <p/>
57     * Every sentence in the CC line is preceeded by <code>-!-</code> .
58     *
59     * @param text      The sentence to be wrapped.
60     * @param addIndent <code>true</code> if the current sentence contains an ordered list and needs additional indent.
61     * @return the wrapped sentence.
62     * @throws EnzymeFlatFileWriteException if an error occurred during the line breaking process.
63     */
64    private String wrapCCLine(String text, boolean addIndent) throws EnzymeFlatFileWriteException {
65      assert text != null : text;
66  
67      String lineStart = "CC   ";
68      int netLineWidth = LINEWIDTH - (lineStart.length() + 4); // Minus length '-!- '.
69      StringBuilder wrappedText = new StringBuilder();
70      if (text.length() <= netLineWidth) {
71        wrappedText.append(lineStart);
72        wrappedText.append("-!- ");
73        wrappedText.append(text);
74        wrappedText.append("\n");
75        return wrappedText.toString();
76      }
77  
78      StringBuilder restText = new StringBuilder(text);
79      wrappedText.append(lineStart);
80      wrappedText.append("-!- ");
81  
82      boolean orderedListIndicatorFound = false;
83      boolean foundBeginningOfItem = false;
84      String indent = "";
85      LineWrapper lineWrapPositioner = LineWrapperFactory.create(text, LineType.CC);
86      while (restText.toString().trim().length() > netLineWidth) {
87        int position;
88        if (restText.charAt(0) == ' ') restText.deleteCharAt(0);
89        // Calculate correct net line width in case of an ordered list with additional indent.
90        if (orderedListIndicatorFound) {
91          if (restText.toString().trim().matches("\\(\\d+\\).+?")) {
92            position = lineWrapPositioner.findPosition(restText.toString().trim(), netLineWidth);
93            foundBeginningOfItem = true;
94          } else {
95            position = lineWrapPositioner.findPosition(restText.toString().trim(), netLineWidth - indent.length());
96            foundBeginningOfItem = false;
97          }
98        } else {
99          position = lineWrapPositioner.findPosition(restText.toString().trim(), netLineWidth);
100       }
101 
102 
103       String line = restText.substring(0, position).trim(); // Ignore leading space.
104 
105       // Check if additional indent has to be added and retrieve indent.
106       if (addIndent && line.trim().endsWith(":")) orderedListIndicatorFound = true;
107       if (foundBeginningOfItem)
108         indent = getIndent(line);
109 
110       restText.delete(0, position);
111       // Add indent if necessary.
112       if (orderedListIndicatorFound && !foundBeginningOfItem) {
113         wrappedText.append(indent);
114       }
115 
116       wrappedText.append(line);
117       wrappedText.append("\n");
118       wrappedText.append(lineStart);
119       wrappedText.append("    ");
120     }
121 //    if (restText.charAt(0) == ' ') restText.deleteCharAt(0); // Ignore leading space.
122     String tail = restText.toString().trim();
123 //    if (orderedListIndicatorFound && !tail.matches("\\(\\d+\\).+?")) {
124 //      wrappedText.append(indent);
125 //    }
126     wrappedText.append(tail);
127     wrappedText.append("\n");
128 
129     return wrappedText.toString();
130   }
131 
132   /**
133    * Breaks the comment text into individual sentences.
134    *
135    * @param commentText The comment text.
136    * @return a list of sentences.
137    */
138   public List<String> getCommentSentences(String commentText) {
139     assert commentText != null : commentText;
140 
141     List<String> sentences = new ArrayList<String>();
142     final String sentenceDelimiterPattern = "(.*?\\.\\s|.*?http\\:\\/\\/\\S+?\\/\\s)";
143 
144     // Exceptions which do NOT indicate the end of a sentence.
145     // First item is the 'false' end of sentence
146     // Second item is the acceptable regexp for the rest of the sentence
147     final String[][] nonSentenceDelimiters = {
148       {"C. ", "\\p{Lower}.*?"},
149       {"E. ", ".*?"},
150       {"L. ", "\\(.*"},
151       {"(cf. ", ".*?"},
152       {"Cf. ", ".*?"},
153       {"cf. ", ".*?"},
154       {"i.e. ", ".*?"},
155       {"i.e., ", ".*?"},
156       {"i.e.", ".*?"},
157       {"i.e.,", ".*?"},
158       {"e.g. ", ".*?"},
159       {"e.g., ", ".*?"},
160       {"e.g.", ".*?"},
161       {"e.g.,", ".*?"},
162       {"etc. ", ".*?"},
163       {"etc., ", ".*?"},
164       {"etc.", ".*?"},
165       {"etc.,", ".*?"},
166       {"sp. ", "(\\p{Lower}|\\p{Upper}+[ -]?\\p{Digit}+|OxB-1|YAA|PCC|A1-3|No\\.|\\d+|\\(|WS).*?"},
167       {"sp., ", ".*?"},
168       {"sp.", ".*?"},
169       {"sp.,", ".*?"},
170       {"spp. ", ".*?"},
171       {".) ", ".*?"},
172       {".)", ".*?"},
173       {".( ", ".*?"},
174       {".(", ".*?"},
175       {" var. ", ".*?"},
176       {"No. ", ".*?"},
177       {"bv. ", ".*?"},
178       { " pv. ", ".*?" }
179     };
180 
181     Pattern pattern = Pattern.compile(sentenceDelimiterPattern);
182     Matcher matcher = pattern.matcher(commentText);
183 
184     boolean concat = false;
185     StringBuffer sentence = null;
186     int end = 0;
187     boolean found = matcher.find();
188     while (found) {
189       String substring = matcher.group(1); // The sentence (probably).
190       end = matcher.end(1);                // End of the sentence.
191 
192       // The previous iteration extracted a substring of the sentence and needs to be extended by the current substring.
193       if (concat) {
194         sentence.append(substring);
195         concat = false;
196       } else {
197         sentence = new StringBuffer(substring);
198       }
199 
200       // Check if the current substring is not a sentence and needs to be extended by the next substring.
201         for (String[] nonSentenceDelimiter1 : nonSentenceDelimiters) {
202             String nonSentenceDelimiter = nonSentenceDelimiter1[0];
203             if (substring.endsWith(nonSentenceDelimiter)) {
204                 String restOfSentence = commentText.substring(matcher.end());
205                 if (Pattern.matches(nonSentenceDelimiter1[1], restOfSentence)) {
206                     concat = true;
207                     break;
208                 }
209             }
210         }
211 
212       found = matcher.find();                      // Any more sentences/substrings?
213       if (concat && found) continue;               // Concatenate the next substring.
214 
215       if (!concat)
216         sentences.add(sentence.toString().trim()); // Add sentence.
217       else
218         sentences.add(sentence.toString());        // Keep space for concatenation of the tail.
219     }
220 
221     if (concat) { // Concatenate the tail.
222       StringBuilder temp =
223             new StringBuilder(sentences.get(sentences.size() - 1));
224       temp.append(commentText.substring(end));
225       sentences.remove(sentences.size() - 1);
226       sentences.add(temp.toString().trim());
227     } else {
228       sentences.add(commentText.substring(end).trim());
229     }
230 
231     if (commentText.matches(".*?\\:\\s\\(\\d+\\)\\s+.*?")) sentences = mergeListSentences(sentences);
232 
233     return sentences;
234   }
235 
236   /**
237    * Merges sentences which belong to an ordered list.
238    *
239    * @param sentences All sentences identified in the given text.
240    * @return If the list of sentences contained an ordered list the returned list will have less elements.
241    */
242   private List<String> mergeListSentences(List<String> sentences) {
243     List<String> mergedSentences = new ArrayList<String>();
244 
245     int count = 0;
246     for (int iii = 0; iii < sentences.size(); iii++) {
247       String sentence = sentences.get(iii);
248       if (sentence.matches("^\\(\\d+\\).*?")) {
249         StringBuilder extendedSentence = new StringBuilder(
250                 mergedSentences.remove(iii - (1 + count))); // Previous sentence
251         extendedSentence.append(" ");
252         extendedSentence.append(sentence); // Current sentence.
253         mergedSentences.add(iii - (1 + count), extendedSentence.toString());
254         count++;
255       } else
256         mergedSentences.add(sentence);
257     }
258 
259     return mergedSentences;
260   }
261 
262   /**
263    * Calculates the indent to be added to subsequent lines.
264    *
265    * @param line The line containing the beginning of a list item (e.g. '(1)').
266    * @return the indent.
267    */
268   private String getIndent(String line) {
269     assert line != null : line;
270 
271     StringBuilder indent = new StringBuilder();
272     // Ex.: >(1) < -> 4 spaces.
273 //    for (int counter = 0, spaces = line.substring(line.indexOf("("), line.indexOf(")") + 2).length();
274 //         counter < spaces;
275 //         counter++)
276 //      indent.append(" ");
277 
278     return indent.toString();
279   }
280 
281 }