View Javadoc

1   /* BadURIsStopPageParsingSelfTest
2    *
3    * Created on Mar 10, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.selftest;
24  
25  import java.io.File;
26  import java.util.ArrayList;
27  import java.util.Arrays;
28  import java.util.Iterator;
29  import java.util.List;
30  
31  /***
32   * Selftest for figuring problems parsing URIs in a page.
33   * 
34   * @author stack
35   * @see <a 
36   * href="https://sourceforge.net/tracker/?func=detail&aid=788219&group_id=73833&atid=539099">[ 788219 ]
37   * URI Syntax Errors stop page parsing.</a>
38   * @version $Revision: 4931 $, $Date: 2007-02-21 18:48:17 +0000 (Wed, 21 Feb 2007) $
39   */
40  public class BadURIsStopPageParsingSelfTest extends SelfTestCase
41  {
42      /***
43       * Files to find as a list.
44       * 
45       * We don't find goodtwo.html because it has a BASE that is out
46       * of scope.
47       */
48      private static final List<File> FILES_TO_FIND =
49          Arrays.asList(new File[]
50              {new File("goodone.html"),
51                  new File("goodthree.html"),
52                  new File("one.html"),
53                  new File("two.html"),
54                  new File("three.html")});
55  
56      public void stestFilesFound() {
57          assertInitialized();
58          List<File> foundFiles = filesFoundInArc();
59          ArrayList<File> editedFoundFiles
60           = new ArrayList<File>(foundFiles.size());
61          for (Iterator i = foundFiles.iterator(); i.hasNext();) {
62              File f = (File)i.next();
63              if (f.getAbsolutePath().endsWith("polishex.html")) {
64                  // There is a URI in our list with the above as suffix.  Its in
65                  // the arc as a 404. Remove it.  It doesn't exist on disk so it
66                  // will cause the below testFilesInArc to fail.
67                  continue;
68              }
69              editedFoundFiles.add(f);
70          }
71          testFilesInArc(FILES_TO_FIND, editedFoundFiles);
72      }
73  }