View Javadoc

1   /* RegexpLineIterator
2   *
3   * $Id: RegexpLineIterator.java 6910 2010-07-02 17:46:12Z gojomo $
4   *
5   * Created on Jul 27, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.util.iterator;
26  
27  import java.util.Iterator;
28  import java.util.logging.Logger;
29  import java.util.regex.Matcher;
30  import java.util.regex.Pattern;
31  
32  /***
33   * Utility class providing an Iterator interface over line-oriented
34   * text input. By providing regexps indicating lines to ignore
35   * (such as pure whitespace or comments), lines to consider input, and
36   * what to return from the input lines (such as a whitespace-trimmed
37   * non-whitespace token with optional trailing comment), this can
38   * be configured to handle a number of formats. 
39   * 
40   * The public static members provide pattern configurations that will
41   * be helpful in a wide variety of contexts. 
42   * 
43   * @author gojomo
44   */
45  public class RegexpLineIterator 
46  extends TransformingIteratorWrapper<String,String> {
47      private static final Logger logger =
48          Logger.getLogger(RegexpLineIterator.class.getName());
49  
50      public static final String COMMENT_LINE = "//s*(#.*)?";
51      public static final String NONWHITESPACE_ENTRY_TRAILING_COMMENT = 
52          "^[//s\ufeff]*(//S+)//s*(#.*)?$";
53      public static final String TRIMMED_ENTRY_TRAILING_COMMENT = 
54          "^//s*([^#]+?)//s*(#.*)?$";
55  
56      public static final String ENTRY = "$1";
57  
58      protected Matcher ignoreLine = null;
59      protected Matcher extractLine = null;
60      protected String outputTemplate = null;
61  
62  
63      public RegexpLineIterator(Iterator<String> inner, String ignore, 
64              String extract, String replace) {
65          this.inner = inner;
66          ignoreLine = Pattern.compile(ignore).matcher("");
67          extractLine = Pattern.compile(extract).matcher("");
68          outputTemplate = replace;
69      }
70  
71      /***
72       * Loads next item into lookahead spot, if available. Skips
73       * lines matching ignoreLine; extracts desired portion of
74       * lines matching extractLine; informationally reports any
75       * lines matching neither. 
76       * 
77       * @return whether any item was loaded into next field
78       */
79      protected String transform(String line) {
80          ignoreLine.reset(line);
81          if(ignoreLine.matches()) {
82              return null; 
83          }
84          extractLine.reset(line);
85          if(extractLine.matches()) {
86              StringBuffer output = new StringBuffer();
87              // TODO: consider if a loop that find()s all is more 
88              // generally useful here
89              extractLine.appendReplacement(output,outputTemplate);
90              return output.toString();
91          }
92          // no match; possibly error
93          logger.warning("line not extracted nor no-op: "+line);
94          return null;
95      }
96  }