View Javadoc

1   package uk.ac.ebi.intenz.tools.sib.writer;
2   
3   import java.util.Iterator;
4   import java.util.SortedSet;
5   import java.util.TreeSet;
6   import java.util.regex.Matcher;
7   import java.util.regex.Pattern;
8   
9   /**
10   * This class provides a factory method to retrieve the adequate <code>LineWrapper</code> implementation.
11   *
12   * @author Michael Darsow
13   * @version $Revision: 1.3 $ $Date: 2008/11/18 17:01:10 $
14   */
15  public class LineWrapperFactory {
16  
17    /**
18     * These regular expression patterns are used to identify potential line break positions within a string.
19     */
20    private static final String[] lineBreakPatterns = {
21      "\\=\\s",
22      "[^\\(\\sEC]\\s",
23      "\\-\\d+\\-[^\\>\\|\\)]",
24      "[\\]\\)]\\-[^\\>\\|\\)]",
25      "\\-\\w{1,1}\\-[^\\>\\|\\)]",
26      "\\-\\d(\\,\\d)+\\-[^\\>\\|\\)]",
27      "\\w{3,}\\-[^\\>\\|\\)]",
28      "\\,\\d+\\-[^\\>\\|\\)]"
29    };
30  
31  //    "\\-\\d+\\-[^\\>\\|]",
32  //    "\\]\\-[^\\>\\|]",
33  //    "\\-\\w{1,1}\\-[^\\>\\|]",
34  //    "\\-\\d(\\,\\d)+\\-[^\\>\\|]",
35  //    "\\w{3,}\\-[^\\>\\|]",
36  //    "\\,\\d+\\-[^\\>\\|]"
37    private static final String[] enforcedLineBreakPatterns = {
38      "\\:?((?<!and)\\s\\(\\d+\\)\\s+)(?!.*?and\\s\\(\\d+\\).*?).+?\\."
39    };
40  
41    /**
42     * Returns a <code>LineWrapper</code> implementation which will be chosen according to the needs of the text to be
43     * wrapped.
44     *
45     * @param wholeText The text to be wrapped.
46     * @param lineType  The {@link LineType LineType}
47     * @return a concrete <code>LineWrapper</code> implementation.
48     * @throws NullPointerException if any of the parameters is <code>null</code>.
49     */
50    public static LineWrapper create(String wholeText, LineType lineType) {
51      if (wholeText == null || lineType == null) throw new NullPointerException();
52  
53      if (!wholeText.matches(".*?[\\:\\.]\\s+?\\(\\d+\\)\\s+.+?")) {
54        if (lineType == LineType.CA && wholeText.matches(".+?\\s\\+\\s.+?\\s\\=\\s.+?\\s\\+\\s.+?")) {
55          // If the compounds consist of more than 3 words this reaction is very likely a descriptive reaction and
56          // therefore not affected by the rule contained in the ReactionWrapper.
57          Pattern pattern = Pattern.compile("(.+?)\\s\\+\\s(.+?)\\s\\=\\s(.+?)\\s\\+\\s(.+)");
58          Matcher matcher = pattern.matcher(wholeText);
59          if (matcher.find()) {
60            String compound1 = matcher.group(1);
61            String compound2 = matcher.group(2);
62            String compound3 = matcher.group(3);
63            String compound4 = matcher.group(4);
64            if (countWords(compound1) > 3 || countWords(compound2) > 3 ||
65                    countWords(compound3) > 3 || countWords(compound4) > 3)
66              return new DefaultLineWrapper();
67          }
68  
69          return new ReactionWrapper(); // Descriptive reactions are not included.
70        }
71        return new DefaultLineWrapper();
72      } else
73        return new OrderedListLineWrapper();
74    }
75  
76    private static class DefaultLineWrapper implements LineWrapper {
77      private static final int MIN_LENGTH_DIVISOR = 4;
78      private static final int MIN_LENGTH_NUMBER = 2;
79  
80      public int findPosition(String text, int netLineWidth) throws EnzymeFlatFileWriteException {
81        SortedSet possibleLineBreakPositions = getPossibleLineBreakPositions(text, netLineWidth);
82        if (possibleLineBreakPositions == null) throw new EnzymeFlatFileWriteException("No line break positions found");
83        return getNearestLineBreakPosition(possibleLineBreakPositions, netLineWidth, text);
84      }
85  
86      /**
87       * Tries to find as many potential line break positions in the given text as possible.
88       * <p/>
89       *
90       * @param text Text to be examined.
91       * @return a <code>TreeSet</code> containing all potential line break positions or <code>null</code> if no positions
92       *         could be found.
93       */
94      protected SortedSet getPossibleLineBreakPositions(String text, int lineWidth) {
95        assert text != null : text;
96        SortedSet possibleLineBreakPositions = null;
97  
98        Pattern pattern;
99        Matcher matcher;
100 
101       for (int iii = 0; iii < lineBreakPatterns.length; iii++) {
102         String lineBreakPattern = lineBreakPatterns[iii];
103 
104         pattern = Pattern.compile(lineBreakPattern);
105         matcher = pattern.matcher(text);
106         while (matcher.find()) {
107           int position = matcher.end();
108           // The space character is ignored for position determination, so is the '>' character.
109           if (lineBreakPattern.endsWith("\\s") || lineBreakPattern.endsWith("[^\\>\\|\\)]")) position--;
110           if (possibleLineBreakPositions == null){
111         	  possibleLineBreakPositions = new TreeSet();
112           }
113           possibleLineBreakPositions.add(new Integer(position));
114         }
115       }
116 
117 //      Iterator it = possibleLineBreakPositions.iterator();
118 //      while (it.hasNext()) {
119 //        System.out.println("it.next() = " + ((Integer) it.next()).intValue());
120 //      }
121 
122       return possibleLineBreakPositions;
123     }
124 
125     /**
126      * Returns the nearest line break position to the given maximum position.
127      *
128      * @param possibleLineBreakPositions All line break positions calculated.
129      * @param netLineWidth               The maximum position.
130      * @return the nearest line break position which is <= netLineWidth.
131      */
132     protected int getNearestLineBreakPosition(SortedSet possibleLineBreakPositions, int netLineWidth, String text) {
133       assert possibleLineBreakPositions != null : possibleLineBreakPositions;
134       assert netLineWidth > -1 : netLineWidth;
135 //      System.out.println("text = " + text);
136       int previousLineBreakPosition = 0;
137       int possibleLineBreakPosition = 0;
138       int lastSpace = 0;
139 
140       Iterator it = possibleLineBreakPositions.iterator();
141       while (it.hasNext()) {
142         int position = ((Integer) it.next()).intValue();
143         if (position <= netLineWidth) {
144           previousLineBreakPosition = possibleLineBreakPosition;
145           if (text.charAt(previousLineBreakPosition) == ' ') lastSpace = previousLineBreakPosition; // Save index of last space.
146           possibleLineBreakPosition = position;
147         }
148       }
149 
150       // Exceptions ...
151       if (previousLineBreakPosition > 0) {
152         // Enforce line wrapping after '=', '+', 'and' and 'or' if these strings appear in the end of the line.
153         if ((possibleLineBreakPosition - previousLineBreakPosition < 5 &&
154                 (text.charAt(previousLineBreakPosition - 1) == '=' ||
155                 text.charAt(previousLineBreakPosition - 1) == '+' ||
156                 text.charAt(previousLineBreakPosition - 1) == ',' //||
157                // text.substring(previousLineBreakPosition - 1, possibleLineBreakPosition).indexOf("and") > -1 ||
158                // text.substring(previousLineBreakPosition - 1, possibleLineBreakPosition).indexOf("or") > -1
159               ))) {
160           return previousLineBreakPosition;
161         }
162       }
163 
164       // Dashed words in the end are treated with special care ...
165 //      if (isDashedWordSpecialCase(possibleLineBreakPosition, lastSpace, text))
166 //        return lastSpace;
167        if(possibleLineBreakPosition==0)
168          if(possibleLineBreakPositions.first()!=null)
169             possibleLineBreakPosition = ((Integer) possibleLineBreakPositions.first()).intValue();
170 
171       return possibleLineBreakPosition;
172     }
173 
174 
175     private boolean isDashedWordSpecialCase(int index, int lastSpace, String text) {
176       String endOfLine = text.substring(lastSpace, index).trim();
177       String wrappedWord = "";
178       if (text.indexOf(" ", lastSpace + 1) > -1) {
179         wrappedWord = text.substring(lastSpace, text.indexOf(" ", lastSpace + 1)).trim();
180       } else {
181         wrappedWord = text.substring(lastSpace);
182       }
183 
184       // 1. If numbers (maybe concatenated by a comma) appear in the endOfLine string only, then wrap AFTER the dash
185       //    only if the length of endOfLine is > MIN_LENGTH_NUMBER; otherwise wrap BEFORE the dash.
186 //      if (endOfLine.matches("\\d+(?:\\,\\d+)*?\\-")) {
187 //        if (endOfLine.substring(0, endOfLine.indexOf("-")).length() > MIN_LENGTH_NUMBER)
188 //          return false;
189 //        return true;
190 //      }
191 
192       // 2. If only one dash exists in the wrapped word than always wrap AFTER the dash.
193       //    Otherwise if the substring is < a quarter of wrappedWord.length() wrap BEFORE the dash;
194       //    WRAP AFTER the dash if none of the above cases apply.
195       if (endOfLine.matches("\\w+?\\-")) {
196         if (wrappedWord.indexOf("-") == wrappedWord.lastIndexOf("-")) return false;
197         if (endOfLine.length() < (wrappedWord.length() / MIN_LENGTH_DIVISOR)) return true; // wrap before endOfLine begins
198         return false;
199       }
200 
201       return false;
202     }
203   }
204 
205   private static class OrderedListLineWrapper extends DefaultLineWrapper {
206     protected SortedSet getPossibleLineBreakPositions(String text, int lineWidth) {
207       SortedSet possibleLineBreakPositions = null;
208       Pattern pattern;
209       Matcher matcher;
210       for (int iii = 0; iii < enforcedLineBreakPatterns.length; iii++) {
211         if (iii == 0) possibleLineBreakPositions = new TreeSet();
212         String lineBreakPattern = enforcedLineBreakPatterns[iii];
213 
214         pattern = Pattern.compile(lineBreakPattern);
215         matcher = pattern.matcher(text);
216         if (matcher.find()) {
217           int position = text.indexOf(matcher.group(1)) + 1;
218           if (position > lineWidth) break;
219           possibleLineBreakPositions.add(new Integer(position));
220           return possibleLineBreakPositions;
221         }
222       }
223 
224       return super.getPossibleLineBreakPositions(text, lineWidth);
225     }
226   }
227 
228   private static class ReactionWrapper extends DefaultLineWrapper {
229     // 1.10.2.1
230     protected SortedSet getPossibleLineBreakPositions(String text, int lineWidth) {
231       SortedSet possibleLineBreakPositions = super.getPossibleLineBreakPositions(text, lineWidth);
232       SortedSet cleanedLineBreakPositions = new TreeSet();
233       for (Iterator it = possibleLineBreakPositions.iterator(); it.hasNext();) {
234         int position = ((Integer) it.next()).intValue();
235         if (text.charAt(position) == ' ') {
236           char before = text.charAt(position - 1);
237           char after = text.charAt(position + 1);
238 //          if (before != '+' && before != '=' && after != '+' && after != '=')
239           if (before != '+' && before != '=')
240             continue;
241         }
242         cleanedLineBreakPositions.add(new Integer(position));
243       }
244 
245       return cleanedLineBreakPositions;
246     }
247   }
248 
249   private static int countWords(String text) {
250     Pattern pattern = Pattern.compile("\\b");
251     Matcher matcher = pattern.matcher(text);
252     int count = 0;
253     while (matcher.find()) count++;
254     return count / 2;
255   }
256 
257 }