View Javadoc

1   /* CharsetSelfTest
2    *
3    * Created on Mar 10, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.selftest;
24  
25  import java.io.File;
26  import java.util.Arrays;
27  import java.util.List;
28  
29  /***
30   * Simple test to ensure we can extract links from multibyte pages.
31   *
32   * That is, can we regex over a multibyte stream.
33   *
34   * @author stack
35   * @version $Revision: 4931 $, $Date: 2007-02-21 18:48:17 +0000 (Wed, 21 Feb 2007) $
36   */
37  public class CharsetSelfTest extends SelfTestCase
38  {
39      /***
40       * Files to find as a list.
41       */
42      private static final List<File> FILES_TO_FIND =
43          Arrays.asList(new File[]
44              {new File("utf8.jsp"),
45                  new File("shiftjis.jsp"),
46                  new File("charsetselftest_end.html")});
47  
48      /***
49       * Look for last file in link chain.
50       *
51       * The way the pages are setup under the CharsetSelfTest directory under
52       * the webapp is that we have one multibyte page w/ a single link buried in
53       * it that points off to another multibyte page.  On the end of the link
54       * chain is a page named END_OF_CHAIN_PAGE.  This test looks to see that
55       * arc has all pages in the chain.
56       */
57      public void stestCharset()
58      {
59          assertInitialized();
60          testFilesInArc(FILES_TO_FIND);
61      }
62  }