View Javadoc

1   /* LaxURLCodec
2   *
3   * $Id: LaxURLCodec.java 4365 2006-07-18 00:40:16Z gojomo $
4   *
5   * Created on Jul 21, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.net;
26  
27  import java.io.ByteArrayOutputStream;
28  import java.io.UnsupportedEncodingException;
29  import java.util.BitSet;
30  
31  import org.apache.commons.codec.net.URLCodec;
32  
33  /***
34   * @author gojomo
35   */
36  public class LaxURLCodec extends URLCodec {
37      public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8");
38  
39      // passthrough constructor
40      public LaxURLCodec(String encoding) {
41          super(encoding);
42      }
43  
44      /***
45       * Decodes an array of URL safe 7-bit characters into an array of 
46       * original bytes. Escaped characters are converted back to their 
47       * original representation.
48       * 
49       * Differs from URLCodec.decodeUrl() in that it throws no 
50       * exceptions; bad or incomplete escape sequences are ignored
51       * and passed into result undecoded. This matches the behavior
52       * of browsers, which will use inconsistently-encoded URIs
53       * in HTTP request-lines. 
54       *
55       * @param bytes array of URL safe characters
56       * @return array of original bytes 
57       */
58      public static final byte[] decodeUrlLoose(byte[] bytes) 
59      {
60          if (bytes == null) {
61              return null;
62          }
63          ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 
64          for (int i = 0; i < bytes.length; i++) {
65              int b = bytes[i];
66              if (b == '+') {
67                  buffer.write(' ');
68                  continue;
69              }
70              if (b == '%') {
71                  if(i+2<bytes.length) {
72                      int u = Character.digit((char)bytes[i+1], 16);
73                      int l = Character.digit((char)bytes[i+2], 16);
74                      if (u > -1 && l > -1) {
75                          // good encoding
76                          int c = ((u << 4) + l);
77                          buffer.write((char)c);
78                          i += 2;
79                          continue;
80                      } // else: bad encoding digits, leave '%' in place
81                  } // else: insufficient encoding digits, leave '%' in place
82              }
83              buffer.write(b);
84          }
85          return buffer.toByteArray(); 
86      }
87  
88      /***
89       * A more expansive set of ASCII URI characters to consider as 'safe' to
90       * leave unencoded, based on actual browser behavior.
91       */
92      public static BitSet EXPANDED_URI_SAFE = new BitSet(256);
93      static {
94          // alpha characters
95          for (int i = 'a'; i <= 'z'; i++) {
96              EXPANDED_URI_SAFE.set(i);
97          }
98          for (int i = 'A'; i <= 'Z'; i++) {
99              EXPANDED_URI_SAFE.set(i);
100         }
101         // numeric characters
102         for (int i = '0'; i <= '9'; i++) {
103             EXPANDED_URI_SAFE.set(i);
104         }
105         // special chars
106         EXPANDED_URI_SAFE.set('-');
107         EXPANDED_URI_SAFE.set('~');
108         EXPANDED_URI_SAFE.set('_');
109         EXPANDED_URI_SAFE.set('.');
110         EXPANDED_URI_SAFE.set('*');
111         EXPANDED_URI_SAFE.set('/');
112         EXPANDED_URI_SAFE.set('=');
113         EXPANDED_URI_SAFE.set('&');
114         EXPANDED_URI_SAFE.set('+');
115         EXPANDED_URI_SAFE.set(',');
116         EXPANDED_URI_SAFE.set(':');
117         EXPANDED_URI_SAFE.set(';');
118         EXPANDED_URI_SAFE.set('@');
119         EXPANDED_URI_SAFE.set('$');
120         EXPANDED_URI_SAFE.set('!');
121         EXPANDED_URI_SAFE.set(')');
122         EXPANDED_URI_SAFE.set('(');
123         // experiments indicate: Firefox (1.0.6) never escapes '%'
124         EXPANDED_URI_SAFE.set('%');
125         // experiments indicate: Firefox (1.0.6) does not escape '|' or '''
126         EXPANDED_URI_SAFE.set('|'); 
127         EXPANDED_URI_SAFE.set('\'');
128     }
129     
130     public static BitSet QUERY_SAFE = new BitSet(256);
131     static {
132         QUERY_SAFE.or(EXPANDED_URI_SAFE);
133         // Tests indicate Firefox (1.0.7-1) doesn't escape curlies in query str.
134         QUERY_SAFE.set('{');
135         QUERY_SAFE.set('}');
136         // nor any of these: [ ] ^ ? 
137         QUERY_SAFE.set('[');
138         QUERY_SAFE.set(']');
139         QUERY_SAFE.set('^');
140         QUERY_SAFE.set('?');
141     }
142     
143     /***
144      * Encodes a string into its URL safe form using the specified
145      * string charset. Unsafe characters are escaped.
146      * 
147      * This method is analogous to superclass encode() methods,
148      * additionally offering the ability to specify a different
149      * 'safe' character set (such as EXPANDED_URI_SAFE). 
150      * 
151      * @param safe BitSet of characters that don't need to be encoded
152      * @param pString String to encode
153      * @param cs Name of character set to use
154      * @return Encoded version of <code>pString</code>.
155      * @throws UnsupportedEncodingException
156      */
157     public String encode(BitSet safe, String pString, String cs)
158     throws UnsupportedEncodingException {
159         if (pString == null) {
160             return null;
161         }
162         return new String(encodeUrl(safe,pString.getBytes(cs)), "US-ASCII");
163     }
164 }