1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.net;
26
27 import java.io.ByteArrayOutputStream;
28 import java.io.UnsupportedEncodingException;
29 import java.util.BitSet;
30
31 import org.apache.commons.codec.net.URLCodec;
32
33 /***
34 * @author gojomo
35 */
36 public class LaxURLCodec extends URLCodec {
37 public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8");
38
39
40 public LaxURLCodec(String encoding) {
41 super(encoding);
42 }
43
44 /***
45 * Decodes an array of URL safe 7-bit characters into an array of
46 * original bytes. Escaped characters are converted back to their
47 * original representation.
48 *
49 * Differs from URLCodec.decodeUrl() in that it throws no
50 * exceptions; bad or incomplete escape sequences are ignored
51 * and passed into result undecoded. This matches the behavior
52 * of browsers, which will use inconsistently-encoded URIs
53 * in HTTP request-lines.
54 *
55 * @param bytes array of URL safe characters
56 * @return array of original bytes
57 */
58 public static final byte[] decodeUrlLoose(byte[] bytes)
59 {
60 if (bytes == null) {
61 return null;
62 }
63 ByteArrayOutputStream buffer = new ByteArrayOutputStream();
64 for (int i = 0; i < bytes.length; i++) {
65 int b = bytes[i];
66 if (b == '+') {
67 buffer.write(' ');
68 continue;
69 }
70 if (b == '%') {
71 if(i+2<bytes.length) {
72 int u = Character.digit((char)bytes[i+1], 16);
73 int l = Character.digit((char)bytes[i+2], 16);
74 if (u > -1 && l > -1) {
75
76 int c = ((u << 4) + l);
77 buffer.write((char)c);
78 i += 2;
79 continue;
80 }
81 }
82 }
83 buffer.write(b);
84 }
85 return buffer.toByteArray();
86 }
87
88 /***
89 * A more expansive set of ASCII URI characters to consider as 'safe' to
90 * leave unencoded, based on actual browser behavior.
91 */
92 public static BitSet EXPANDED_URI_SAFE = new BitSet(256);
93 static {
94
95 for (int i = 'a'; i <= 'z'; i++) {
96 EXPANDED_URI_SAFE.set(i);
97 }
98 for (int i = 'A'; i <= 'Z'; i++) {
99 EXPANDED_URI_SAFE.set(i);
100 }
101
102 for (int i = '0'; i <= '9'; i++) {
103 EXPANDED_URI_SAFE.set(i);
104 }
105
106 EXPANDED_URI_SAFE.set('-');
107 EXPANDED_URI_SAFE.set('~');
108 EXPANDED_URI_SAFE.set('_');
109 EXPANDED_URI_SAFE.set('.');
110 EXPANDED_URI_SAFE.set('*');
111 EXPANDED_URI_SAFE.set('/');
112 EXPANDED_URI_SAFE.set('=');
113 EXPANDED_URI_SAFE.set('&');
114 EXPANDED_URI_SAFE.set('+');
115 EXPANDED_URI_SAFE.set(',');
116 EXPANDED_URI_SAFE.set(':');
117 EXPANDED_URI_SAFE.set(';');
118 EXPANDED_URI_SAFE.set('@');
119 EXPANDED_URI_SAFE.set('$');
120 EXPANDED_URI_SAFE.set('!');
121 EXPANDED_URI_SAFE.set(')');
122 EXPANDED_URI_SAFE.set('(');
123
124 EXPANDED_URI_SAFE.set('%');
125
126 EXPANDED_URI_SAFE.set('|');
127 EXPANDED_URI_SAFE.set('\'');
128 }
129
130 public static BitSet QUERY_SAFE = new BitSet(256);
131 static {
132 QUERY_SAFE.or(EXPANDED_URI_SAFE);
133
134 QUERY_SAFE.set('{');
135 QUERY_SAFE.set('}');
136
137 QUERY_SAFE.set('[');
138 QUERY_SAFE.set(']');
139 QUERY_SAFE.set('^');
140 QUERY_SAFE.set('?');
141 }
142
143 /***
144 * Encodes a string into its URL safe form using the specified
145 * string charset. Unsafe characters are escaped.
146 *
147 * This method is analogous to superclass encode() methods,
148 * additionally offering the ability to specify a different
149 * 'safe' character set (such as EXPANDED_URI_SAFE).
150 *
151 * @param safe BitSet of characters that don't need to be encoded
152 * @param pString String to encode
153 * @param cs Name of character set to use
154 * @return Encoded version of <code>pString</code>.
155 * @throws UnsupportedEncodingException
156 */
157 public String encode(BitSet safe, String pString, String cs)
158 throws UnsupportedEncodingException {
159 if (pString == null) {
160 return null;
161 }
162 return new String(encodeUrl(safe,pString.getBytes(cs)), "US-ASCII");
163 }
164 }