1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.net;
27
28 import java.util.Iterator;
29 import java.util.TreeMap;
30
31 import junit.framework.TestCase;
32
33 import org.apache.commons.httpclient.URIException;
34
35 /***
36 * Test UURIFactory for proper UURI creation across variety of
37 * important/tricky cases.
38 *
39 * Be careful writing this file. Make sure you write it with UTF-8 encoding.
40 *
41 * @author igor stack gojomo
42 */
43 public class UURIFactoryTest extends TestCase {
44
45 public final void testEscaping() throws URIException {
46
47 final String ESCAPED_URISTR = "http://archive.org/" +
48 UURIFactory.ESCAPED_SPACE +
49 UURIFactory.ESCAPED_SPACE +
50 UURIFactory.ESCAPED_CIRCUMFLEX +
51 UURIFactory.ESCAPED_QUOT +
52 UURIFactory.SQUOT +
53 UURIFactory.ESCAPED_APOSTROPH +
54 UURIFactory.ESCAPED_LSQRBRACKET +
55 UURIFactory.ESCAPED_RSQRBRACKET +
56 UURIFactory.ESCAPED_LCURBRACKET +
57 UURIFactory.ESCAPED_RCURBRACKET +
58 UURIFactory.SLASH + "a.gif";
59
60 final String URISTR = "http://archive.org/.././" + "\u00A0" +
61 UURIFactory.SPACE + UURIFactory.CIRCUMFLEX +
62 UURIFactory.QUOT + UURIFactory.SQUOT +
63 UURIFactory.APOSTROPH + UURIFactory.LSQRBRACKET +
64 UURIFactory.RSQRBRACKET + UURIFactory.LCURBRACKET +
65 UURIFactory.RCURBRACKET + UURIFactory.BACKSLASH +
66 "test/../a.gif" + "\u00A0" + UURIFactory.SPACE;
67
68 UURI uuri = UURIFactory.getInstance(URISTR);
69 final String uuriStr = uuri.toString();
70 assertEquals("expected escaping", ESCAPED_URISTR, uuriStr);
71 }
72
73 public final void testUnderscoreMakesPortParseFail() throws URIException {
74 UURI uuri = UURIFactory.getInstance("http://one-two_three:8080/index.html");
75 int port = uuri.getPort();
76 assertTrue("Failed find of port " + uuri, port == 8080);
77 }
78
79 public final void testRelativeURIWithTwoSlashes() throws URIException {
80 UURI base = UURIFactory.getInstance("http://www.archive.org");
81 UURI uuri = UURIFactory.getInstance(base, "one//index.html");
82 assertTrue("Doesn't do right thing with two slashes " + uuri,
83 uuri.toString().equals(
84 "http://www.archive.org/one//index.html"));
85 }
86
87 public final void testTrailingEncodedSpace() throws URIException {
88 UURI uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20");
89 assertTrue("Doesn't strip trailing encoded space 1 " + uuri,
90 uuri.toString().equals("http://www.nps-shoes.co.uk/"));
91 uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20%20%20");
92 assertTrue("Doesn't strip trailing encoded space 2 " + uuri,
93 uuri.toString().equals("http://www.nps-shoes.co.uk/"));
94 }
95
96 public final void testPort0080is80() throws URIException {
97 UURI uuri = UURIFactory.getInstance("http://archive.org:0080");
98 assertTrue("Doesn't strip leading zeros " + uuri,
99 uuri.toString().equals("http://archive.org/"));
100 }
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126 public final void testEscapeEncoding() throws URIException {
127 UURI uuri = UURIFactory.getInstance("http://www.y1y1.com/" +
128 "albums/userpics/11111/normal_%E3%E4%EC%EC%EC.jpg", "windows-1256");
129 uuri.getPath();
130 }
131
132 public final void testTooLongAfterEscaping() {
133 StringBuffer buffer = new StringBuffer("http://www.archive.org/a/");
134
135 for (int i = 0; i < 1024; i++) {
136 buffer.append(" ");
137 }
138 buffer.append("/index.html");
139 String message = null;
140 try {
141 UURIFactory.getInstance(buffer.toString());
142 } catch (URIException e) {
143 message = e.getMessage();
144 }
145 assertTrue("Wrong or no exception: " + message, (message != null) &&
146 message.startsWith("Created (escaped) uuri >"));
147 }
148
149 public final void testFtpUris() throws URIException {
150 final String FTP = "ftp";
151 final String AUTHORITY = "pfbuser:pfbuser@mprsrv.agri.gov.cn";
152 final String PATH = "/clzreceive/";
153 final String uri = FTP + "://" + AUTHORITY + PATH;
154 UURI uuri = UURIFactory.getInstance(uri);
155 assertTrue("Failed to get matching scheme: " + uuri.getScheme(),
156 (uuri.getScheme()).equals(FTP));
157 assertTrue("Failed to get matching authority: " +
158 uuri.getAuthority(), (uuri.getAuthority()).equals(AUTHORITY));
159 assertTrue("Failed to get matching path: " +
160 uuri.getPath(), (uuri.getPath()).equals(PATH));
161 }
162
163 public final void testWhitespaceEscaped() throws URIException {
164
165
166 String uri = "http://archive.org/index%25 .html";
167 String tgtUri = "http://archive.org/index%25%20.html";
168 UURI uuri = UURIFactory.getInstance(uri);
169 assertTrue("Not equal " + uuri.toString(),
170 uuri.toString().equals(tgtUri));
171 uri = "http://archive.org/index%25\u001D.html";
172 tgtUri = "http://archive.org/index%25%1D.html".toLowerCase();
173 uuri = UURIFactory.getInstance(uri);
174 assertEquals("whitespace escaping", tgtUri, uuri.toString());
175 uri = "http://gemini.info.usaid.gov/directory/" +
176 "pbResults.cfm?&urlNameLast=Rumplestiltskin";
177 tgtUri = "http://gemini.info.usaid.gov/directory/faxResults.cfm?" +
178 "name=Ebenezer%20+Rumplestiltskin,&location=RRB%20%20%20%205%2E08%2D006";
179 uuri = UURIFactory.getInstance(UURIFactory.getInstance(uri),
180 "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" +
181 "RRB%20%20%20%205%2E08%2D006");
182 assertEquals("whitespace escaping", tgtUri, uuri.toString());
183 }
184
185
186
187
188
189
190
191
192
193
194
195
196 public final void testDnsHost() throws URIException {
197 String uri = "dns://ads.nandomedia.com:81/one.html";
198 UURI uuri = UURIFactory.getInstance(uri);
199 String host = uuri.getReferencedHost();
200 assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));
201 uri = "dns:ads.nandomedia.com";
202 uuri = UURIFactory.getInstance(uri);
203 host = uuri.getReferencedHost();
204 assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));
205 uri = "dns:ads.nandomedia.com?a=b";
206 uuri = UURIFactory.getInstance(uri);
207 host = uuri.getReferencedHost();
208 assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));
209 }
210
211 public final void testPercentEscaping() throws URIException {
212 final String uri = "http://archive.org/%a%%%%%.html";
213
214 final String tgtUri = "http://archive.org/%a%%%%%.html";
215 UURI uuri = UURIFactory.getInstance(uri);
216 assertEquals("Not equal",tgtUri, uuri.toString());
217 }
218
219 public final void testRelativeDblPathSlashes() throws URIException {
220 UURI base = UURIFactory.getInstance("http://www.archive.org/index.html");
221 UURI uuri = UURIFactory.getInstance(base, "JIGOU//KYC//INDEX.HTM");
222 assertTrue("Double slash not working " + uuri.toString(),
223 uuri.getPath().equals("/JIGOU//KYC//INDEX.HTM"));
224 }
225
226 public final void testRelativeWithScheme() throws URIException {
227 UURI base = UURIFactory.getInstance("http://www.example.com/some/page");
228 UURI uuri = UURIFactory.getInstance(base, "http:boo");
229 assertTrue("Relative with scheme not working " + uuri.toString(),
230 uuri.toString().equals("http://www.example.com/some/boo"));
231 }
232
233 public final void testBadBaseResolve() throws URIException {
234 UURI base = UURIFactory.getInstance("http://license.joins.com/board/" +
235 "etc_board_list.asp?board_name=new_main&b_type=&nPage=" +
236 "2&category=G&lic_id=70&site=changeup&g_page=changeup&g_sPage=" +
237 "notice&gate=02");
238 UURIFactory.getInstance(base, "http://www.changeup.com/...</a");
239 }
240
241 public final void testTilde() throws URIException {
242 noChangeExpected("http://license.joins.com/~igor");
243 }
244
245 public final void testCurlies() throws URIException {
246
247
248
249 UURI uuri =
250 noChangeExpected("http://license.joins.com/igor?one={curly}");
251 assertEquals(uuri.getQuery(), "one={curly}");
252 assertEquals(UURIFactory.
253 getInstance("http://license.joins.com/igor{curly}.html").
254 toString(),
255 "http://license.joins.com/igor%7Bcurly%7D.html");
256 boolean exception = false;
257 try {
258 UURIFactory.getInstance("http://license.{curly}.com/igor.html");
259 } catch (URIException u) {
260 exception = true;
261 }
262 assertTrue("Did not get exception.", exception);
263 }
264
265 protected UURI noChangeExpected(final String original)
266 throws URIException {
267 UURI uuri = UURIFactory.getInstance(original);
268 assertEquals(original, uuri.toString());
269 return uuri;
270 }
271
272 public final void testTrimSpaceNBSP() throws URIException {
273 final String uri = " http://archive.org/DIR WITH SPACES/" +
274 UURIFactory.NBSP + "home.html " + UURIFactory.NBSP + " ";
275 final String tgtUri =
276 "http://archive.org/DIR%20WITH%20SPACES/%20home.html";
277 UURI uuri = UURIFactory.getInstance(uri);
278 assertTrue("Not equal " + uuri.toString(),
279 uuri.toString().equals(tgtUri));
280 }
281
282 /***
283 * Test space plus encoding ([ 1010966 ] crawl.log has URIs with spaces in them).
284 * See <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1010966&group_id=73833&atid=539099">[ 1010966 ] crawl.log has URIs with spaces in them</a>.
285 * @throws URIException
286 */
287 public final void testSpaceDoubleEncoding() throws URIException {
288 final String uri = "http://www.brook.edu/i.html? %20taxonomy=Politics";
289 final String encodedUri =
290 "http://www.brook.edu/i.html?%20%20taxonomy=Politics";
291 UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1");
292 assertTrue("Not equal " + uuri.toString(),
293 uuri.toString().equals(encodedUri));
294 }
295
296 /***
297 * Test for doubly-encoded sequences.
298 * See <a href="https://sourceforge.net/tracker/index.php?func=detail&aid=966219&group_id=73833&atid=539099">[ 966219 ] UURI doubly-encodes %XX sequences</a>.
299 * @throws URIException
300 */
301 public final void testDoubleEncoding() throws URIException {
302 final char ae = '\u00E6';
303 final String uri = "http://archive.org/DIR WITH SPACES/home" +
304 ae + ".html";
305 final String encodedUri =
306 "http://archive.org/DIR%20WITH%20SPACES/home%E6.html";
307 UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1");
308 assertEquals("single encoding", encodedUri, uuri.toString());
309
310 uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1");
311 uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1");
312 assertEquals("double encoding", encodedUri, uuri.toString());
313
314 uuri = UURIFactory.getInstance(uri);
315 final String encodedUtf8Uri =
316 "http://archive.org/DIR%20WITH%20SPACES/home%C3%A6.html";
317 assertEquals("Not equal utf8", encodedUtf8Uri, uuri.toString());
318
319 uuri = UURIFactory.getInstance(uuri.toString());
320 uuri = UURIFactory.getInstance(uuri.toString());
321 assertEquals("Not equal (dbl-encoding) utf8", encodedUtf8Uri, uuri.toString());
322 }
323
324 /***
325 * Test for syntax errors stop page parsing.
326 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788219&group_id=73833&atid=539099">[ 788219 ] URI Syntax Errors stop page parsing</a>
327 * @throws URIException
328 */
329 public final void testThreeSlashes() throws URIException {
330 UURI goodURI = UURIFactory.
331 getInstance("http://lcweb.loc.gov/rr/goodtwo.html");
332 String uuri = "http:///lcweb.loc.gov/rr/goodtwo.html";
333 UURI rewrittenURI = UURIFactory.getInstance(uuri);
334 assertTrue("Not equal " + goodURI + ", " + uuri,
335 goodURI.toString().equals(rewrittenURI.toString()));
336 uuri = "http:////lcweb.loc.gov/rr/goodtwo.html";
337 rewrittenURI = UURIFactory.getInstance(uuri);
338 assertTrue("Not equal " + goodURI + ", " + uuri,
339 goodURI.toString().equals(rewrittenURI.toString()));
340
341 goodURI = UURIFactory.
342 getInstance("https://lcweb.loc.gov/rr/goodtwo.html");
343 uuri = "https:////lcweb.loc.gov/rr/goodtwo.html";
344 rewrittenURI = UURIFactory.getInstance(uuri);
345 assertTrue("Not equal " + goodURI + ", " + uuri,
346 goodURI.toString().equals(rewrittenURI.toString()));
347 }
348
349 public final void testNoScheme() {
350 boolean expectedException = false;
351 String uuri = "www.loc.gov/rr/european/egw/polishex.html";
352 try {
353 UURIFactory.getInstance(uuri);
354 } catch (URIException e) {
355
356 expectedException = true;
357 }
358 assertTrue("Didn't get expected exception: " + uuri,
359 expectedException);
360 }
361
362 public final void testRelative() throws URIException {
363 UURI uuriTgt = UURIFactory.
364 getInstance("http://archive.org:83/home.html");
365 UURI uri = UURIFactory.
366 getInstance("http://archive.org:83/one/two/three.html");
367 UURI uuri = UURIFactory.
368 getInstance(uri, "/home.html");
369 assertTrue("Not equal",
370 uuriTgt.toString().equals(uuri.toString()));
371 }
372
373 /***
374 * Test that an empty uuri does the right thing -- that we get back the
375 * base.
376 *
377 * @throws URIException
378 */
379 public final void testRelativeEmpty() throws URIException {
380 UURI uuriTgt = UURIFactory.
381 getInstance("http://archive.org:83/one/two/three.html");
382 UURI uri = UURIFactory.
383 getInstance("http://archive.org:83/one/two/three.html");
384 UURI uuri = UURIFactory.
385 getInstance(uri, "");
386 assertTrue("Empty length don't work",
387 uuriTgt.toString().equals(uuri.toString()));
388 }
389
390 public final void testAbsolute() throws URIException {
391 UURI uuriTgt = UURIFactory.
392 getInstance("http://archive.org:83/home.html");
393 UURI uri = UURIFactory.
394 getInstance("http://archive.org:83/one/two/three.html");
395 UURI uuri = UURIFactory.
396 getInstance(uri, "http://archive.org:83/home.html");
397 assertTrue("Not equal",
398 uuriTgt.toString().equals(uuri.toString()));
399 }
400
401 /***
402 * Test for [ 962892 ] UURI accepting/creating unUsable URIs (bad hosts).
403 * @see <a href="https://sourceforge.net/tracker/?func=detail&atid=539099&aid=962892&group_id=73833">[ 962892 ] UURI accepting/creating unUsable URIs (bad hosts)</a>
404 */
405 public final void testHostWithLessThan() {
406 checkExceptionOnIllegalDomainlabel("http://www.betamobile.com</A");
407 checkExceptionOnIllegalDomainlabel(
408 "http://C|/unzipped/426/spacer.gif");
409 checkExceptionOnIllegalDomainlabel("http://www.lycos.co.uk\"/l/b/\"");
410 }
411
412 /***
413 * Test for [ 1012520 ] UURI.length() > 2k.
414 * @throws URIException
415 * @see <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1012520&group_id=73833&atid=539099">[ 1012520 ] UURI.length() > 2k</a>
416 */
417 public final void test2kURI() throws URIException {
418 final StringBuffer buffer = new StringBuffer("http://a.b");
419 final String subPath = "/123456789";
420 for (int i = 0; i < 207; i++) {
421 buffer.append(subPath);
422 }
423
424 UURIFactory.getInstance(buffer.toString());
425 boolean gotException = false;
426
427 buffer.append(subPath);
428 try {
429 UURIFactory.getInstance(buffer.toString());
430 } catch (URIException e) {
431 gotException = true;
432 }
433 assertTrue("No expected exception complaining about long URI",
434 gotException);
435 }
436
437 private void checkExceptionOnIllegalDomainlabel(String uuri) {
438 boolean expectedException = false;
439 try {
440 UURIFactory.getInstance(uuri);
441 } catch (URIException e) {
442
443 expectedException = true;
444 }
445 assertTrue("Didn't get expected exception: " + uuri,
446 expectedException);
447 }
448
449 /***
450 * Test for doing separate DNS lookup for same host
451 *
452 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788277&group_id=73833&atid=539099">[ 788277 ] Doing separate DNS lookup for same host</a>
453 * @throws URIException
454 */
455 public final void testHostWithPeriod() throws URIException {
456 UURI uuri1 = UURIFactory.
457 getInstance("http://www.loc.gov./index.html");
458 UURI uuri2 = UURIFactory.
459 getInstance("http://www.loc.gov/index.html");
460 assertEquals("Failed equating hosts with dot",
461 uuri1.getHost(), uuri2.getHost());
462 }
463
464 /***
465 * Test for NPE in java.net.URI.encode
466 *
467 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=874220&group_id=73833&atid=539099">[ 874220 ] NPE in java.net.URI.encode</a>
468 * @throws URIException
469 */
470 public final void testHostEncodedChars() throws URIException {
471 String s = "http://g.msn.co.kr/0nwkokr0/00/19??" +
472 "PS=10274&NC=10009&CE=42&CP=949&HL=" +
473 "���?��";
474 assertNotNull("Encoded chars " + s,
475 UURIFactory.getInstance(s));
476 }
477
478 /***
479 * Test for java.net.URI parses %20 but getHost null
480 *
481 * See <a href="https://sourceforge.net/tracker/?func=detail&aid=927940&group_id=73833&atid=539099">[ 927940 ] java.net.URI parses %20 but getHost null</a>
482 */
483 public final void testSpaceInHost() {
484 boolean expectedException = false;
485 try {
486 UURIFactory.getInstance(
487 "http://www.local-regions.odpm%20.gov.uk" +
488 "/lpsa/challenge/pdf/propect.pdf");
489 } catch (URIException e) {
490 expectedException = true;
491 }
492 assertTrue("Did not fail with escaped space.", expectedException);
493
494 expectedException = false;
495 try {
496 UURIFactory.getInstance(
497 "http://www.local-regions.odpm .gov.uk" +
498 "/lpsa/challenge/pdf/propect.pdf");
499 } catch (URIException e) {
500 expectedException = true;
501 }
502 assertTrue("Did not fail with real space.", expectedException);
503 }
504
505 /***
506 * Test for java.net.URI chokes on hosts_with_underscores.
507 *
508 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=808270&group_id=73833&atid=539099">[ 808270 ] java.net.URI chokes on hosts_with_underscores</a>
509 * @throws URIException
510 */
511 public final void testHostWithUnderscores() throws URIException {
512 UURI uuri = UURIFactory.getInstance(
513 "http://x_underscore_underscore.2u.com.tw/nonexistent_page.html");
514 assertEquals("Failed get of host with underscore",
515 "x_underscore_underscore.2u.com.tw", uuri.getHost());
516 }
517
518
519 /***
520 * Two dots for igor.
521 */
522 public final void testTwoDots() {
523 boolean expectedException = false;
524 try {
525 UURIFactory.getInstance(
526 "http://x_underscore_underscore..2u.com/nonexistent_page.html");
527 } catch (URIException e) {
528 expectedException = true;
529 }
530 assertTrue("Two dots did not throw exception", expectedException);
531 }
532
533 /***
534 * Test for java.net.URI#getHost fails when leading digit.
535 *
536 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=910120&group_id=73833&atid=539099">[ 910120 ] java.net.URI#getHost fails when leading digit.</a>
537 * @throws URIException
538 */
539 public final void testHostWithDigit() throws URIException {
540 UURI uuri = UURIFactory.
541 getInstance("http://0204chat.2u.com.tw/nonexistent_page.html");
542 assertEquals("Failed get of host with digit",
543 "0204chat.2u.com.tw", uuri.getHost());
544 }
545
546 /***
547 * Test for Constraining java URI class.
548 *
549 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=949548&group_id=73833&atid=539099">[ 949548 ] Constraining java URI class</a>
550 */
551 public final void testPort() {
552 checkBadPort("http://www.tyopaikat.com:a/robots.txt");
553 checkBadPort("http://158.144.21.3:80808/robots.txt");
554 checkBadPort("http://pdb.rutgers.edu:81.rutgers.edu/robots.txt");
555 checkBadPort(
556 "https://webmail.gse.harvard.edu:9100robots.txt/robots.txt");
557 checkBadPort(
558 "https://webmail.gse.harvard.edu:0/robots.txt/robots.txt");
559 }
560
561 /***
562 * Test bad port throws exception.
563 * @param uri URI with bad port to check.
564 */
565 private void checkBadPort(String uri) {
566 boolean exception = false;
567 try {
568 UURIFactory.getInstance(uri);
569 }
570 catch (URIException e) {
571 exception = true;
572 }
573 assertTrue("Didn't throw exception: " + uri, exception);
574 }
575
576 /***
577 * Preserve userinfo capitalization.
578 * @throws URIException
579 */
580 public final void testUserinfo() throws URIException {
581 final String authority = "stack:StAcK@www.tyopaikat.com";
582 final String uri = "http://" + authority + "/robots.txt";
583 UURI uuri = UURIFactory.getInstance(uri);
584 assertEquals("Authority not equal", uuri.getAuthority(),
585 authority);
586
587
588
589
590 }
591
592 /***
593 * Test user info + port
594 * @throws URIException
595 */
596 public final void testUserinfoPlusPort() throws URIException {
597 final String userInfo = "stack:StAcK";
598 final String authority = "www.tyopaikat.com";
599 final int port = 8080;
600 final String uri = "http://" + userInfo + "@" + authority + ":" + port
601 + "/robots.txt";
602 UURI uuri = UURIFactory.getInstance(uri);
603 assertEquals("Host not equal", authority,uuri.getHost());
604 assertEquals("Userinfo Not equal",userInfo,uuri.getUserinfo());
605 assertEquals("Port not equal",port,uuri.getPort());
606 assertEquals("Authority wrong","stack:StAcK@www.tyopaikat.com:8080",
607 uuri.getAuthority());
608 assertEquals("AuthorityMinusUserinfo wrong","www.tyopaikat.com:8080",
609 uuri.getAuthorityMinusUserinfo());
610
611 }
612
613 public final void testRFC3986RelativeChange() throws URIException {
614 UURI base = UURIFactory.getInstance("http://a/b/c/d;p?q");
615 tryRelative(base, "?y", "http://a/b/c/d;p?y");
616 }
617
618 /***
619 * Tests from rfc3986
620 *
621 * <pre>
622 * "g:h" = "g:h"
623 * "g" = "http://a/b/c/g"
624 * "./g" = "http://a/b/c/g"
625 * "g/" = "http://a/b/c/g/"
626 * "/g" = "http://a/g"
627 * "//g" = "http://g"
628 * "?y" = "http://a/b/c/d;p?y"
629 * "g?y" = "http://a/b/c/g?y"
630 * "#s" = "http://a/b/c/d;p?q#s"
631 * "g#s" = "http://a/b/c/g#s"
632 * "g?y#s" = "http://a/b/c/g?y#s"
633 * ";x" = "http://a/b/c/;x"
634 * "g;x" = "http://a/b/c/g;x"
635 * "g;x?y#s" = "http://a/b/c/g;x?y#s"
636 * "" = "http://a/b/c/d;p?q"
637 * "." = "http://a/b/c/"
638 * "./" = "http://a/b/c/"
639 * ".." = "http://a/b/"
640 * "../" = "http://a/b/"
641 * "../g" = "http://a/b/g"
642 * "../.." = "http://a/"
643 * "../../" = "http://a/"
644 * "../../g" = "http://a/g"
645 * </pre>
646 *
647 * @throws URIException
648 */
649 public final void testRFC3986Relative() throws URIException {
650 UURI base = UURIFactory.getInstance("http://a/b/c/d;p?q");
651 tryRelative(base, "g:h", "g:h");
652 tryRelative(base, "g", "http://a/b/c/g");
653 tryRelative(base, "./g", "http://a/b/c/g");
654 tryRelative(base, "g/", "http://a/b/c/g/");
655 tryRelative(base, "/g", "http://a/g");
656 tryRelative(base, "//g", "http://g");
657 tryRelative(base, "?y", "http://a/b/c/d;p?y");
658 tryRelative(base, "g?y", "http://a/b/c/g?y");
659 tryRelative(base, "#s", "http://a/b/c/d;p?q#s");
660 tryRelative(base, "g#s", "http://a/b/c/g#s");
661 tryRelative(base, "g?y#s", "http://a/b/c/g?y#s");
662 tryRelative(base, ";x", "http://a/b/c/;x");
663 tryRelative(base, "g;x", "http://a/b/c/g;x");
664 tryRelative(base, "g;x?y#s","http://a/b/c/g;x?y#s");
665 tryRelative(base, "", "http://a/b/c/d;p?q");
666 tryRelative(base, ".", "http://a/b/c/");
667 tryRelative(base, "./", "http://a/b/c/");
668 tryRelative(base, "..", "http://a/b/");
669 tryRelative(base, "../", "http://a/b/");
670 tryRelative(base, "../g", "http://a/b/g");
671 tryRelative(base, "../..", "http://a/");
672 tryRelative(base, "../../", "http://a/");
673 tryRelative(base, "../../g","http://a/g");
674 }
675
676 protected void tryRelative(UURI base, String relative, String expected)
677 throws URIException {
678 UURI uuri = UURIFactory.getInstance(base, relative);
679 assertEquals("Derelativized " + relative + " gave "
680 + uuri + " not " + expected,
681 uuri,UURIFactory.getInstance(expected));
682 }
683
684 /***
 * Tests from rfc2396 with amendments to accommodate differences
 * intentionally added to make our URI handling like IE's.
687 *
688 * <pre>
689 * g:h = g:h
690 * g = http://a/b/c/g
691 * ./g = http://a/b/c/g
692 * g/ = http://a/b/c/g/
693 * /g = http://a/g
694 * //g = http://g
695 * ?y = http://a/b/c/?y
696 * g?y = http://a/b/c/g?y
697 * #s = (current document)#s
698 * g#s = http://a/b/c/g#s
699 * g?y#s = http://a/b/c/g?y#s
700 * ;x = http://a/b/c/;x
701 * g;x = http://a/b/c/g;x
702 * g;x?y#s = http://a/b/c/g;x?y#s
703 * . = http://a/b/c/
704 * ./ = http://a/b/c/
705 * .. = http://a/b/
706 * ../ = http://a/b/
707 * ../g = http://a/b/g
708 * ../.. = http://a/
709 * ../../ = http://a/
710 * ../../g = http://a/g
711 * </pre>
712 *
713 * @throws URIException
714 */
715 public final void testRFC2396Relative() throws URIException {
716 UURI base = UURIFactory.
717 getInstance("http://a/b/c/d;p?q");
718 TreeMap<String,String> m = new TreeMap<String,String>();
719 m.put("..", "http://a/b/");
720 m.put("../", "http://a/b/");
721 m.put("../g", "http://a/b/g");
722 m.put("../..", "http://a/");
723 m.put("../../", "http://a/");
724 m.put("../../g", "http://a/g");
725 m.put("g#s", "http://a/b/c/g#s");
726 m.put("g?y#s ", "http://a/b/c/g?y#s");
727 m.put(";x", "http://a/b/c/;x");
728 m.put("g;x", "http://a/b/c/g;x");
729 m.put("g;x?y#s", "http://a/b/c/g;x?y#s");
730 m.put(".", "http://a/b/c/");
731 m.put("./", "http://a/b/c/");
732 m.put("g", "http://a/b/c/g");
733 m.put("./g", "http://a/b/c/g");
734 m.put("g/", "http://a/b/c/g/");
735 m.put("/g", "http://a/g");
736 m.put("//g", "http://g");
737
738
739 m.put("g?y", "http://a/b/c/g?y");
740
741
742
743 m.put("/../../../../../../../../g", "http://a/g");
744 m.put("../../../../../../../../g", "http://a/g");
745 m.put("../G", "http://a/b/G");
746 for (Iterator i = m.keySet().iterator(); i.hasNext();) {
747 String key = (String)i.next();
748 String value = (String)m.get(key);
749 UURI uuri = UURIFactory.getInstance(base, key);
750 assertTrue("Unexpected " + key + " " + value + " " + uuri,
751 uuri.equals(UURIFactory.getInstance(value)));
752 }
753 }
754
755 /***
756 * A UURI should always be without a 'fragment' segment, which is
757 * unused and irrelevant for network fetches.
758 *
759 * See [ 970666 ] #anchor links not trimmed, and thus recrawled
760 *
761 * @throws URIException
762 */
763 public final void testAnchors() throws URIException {
764 UURI uuri = UURIFactory.
765 getInstance("http://www.example.com/path?query#anchor");
766 assertEquals("Not equal", "http://www.example.com/path?query",
767 uuri.toString());
768 }
769
770
771 /***
772 * Ensure that URI strings beginning with a colon are treated
773 * the same as browsers do (as relative, rather than as absolute
774 * with zero-length scheme).
775 *
776 * @throws URIException
777 */
778 public void testStartsWithColon() throws URIException {
779 UURI base = UURIFactory.getInstance("http://www.example.com/path/page");
780 UURI uuri = UURIFactory.getInstance(base,":foo");
781 assertEquals("derelativize starsWithColon",
782 uuri.getURI(),
783 "http://www.example.com/path/:foo");
784 }
785
786 /***
787 * Ensure that relative URIs with colons in late positions
788 * aren't mistakenly interpreted as absolute URIs with long,
789 * illegal schemes.
790 *
791 * @throws URIException
792 */
793 public void testLateColon() throws URIException {
794 UURI base = UURIFactory.getInstance("http://www.example.com/path/page");
795 UURI uuri1 = UURIFactory.getInstance(base,"example.html;jsessionid=deadbeef:deadbeed?parameter=this:value");
796 assertEquals("derelativize lateColon",
797 uuri1.getURI(),
798 "http://www.example.com/path/example.html;jsessionid=deadbeef:deadbeed?parameter=this:value");
799 UURI uuri2 = UURIFactory.getInstance(base,"example.html?parameter=this:value");
800 assertEquals("derelativize lateColon",
801 uuri2.getURI(),
802 "http://www.example.com/path/example.html?parameter=this:value");
803 }
804
805 /***
806 * Ensure that stray trailing '%' characters do not prevent
807 * UURI instances from being created, and are reasonably
808 * escaped when encountered.
809 *
810 * @throws URIException
811 */
812 public void testTrailingPercents() throws URIException {
813 String plainPath = "http://www.example.com/path%";
814 UURI plainPathUuri = UURIFactory.getInstance(plainPath);
815 assertEquals("plainPath getURI", plainPath, plainPathUuri.getURI());
816 assertEquals("plainPath getEscapedURI",
817 "http://www.example.com/path%", // browsers don't escape '%'
818 plainPathUuri.getEscapedURI());
819
820 String partiallyEscapedPath = "http://www.example.com/pa%20th%";
821 UURI partiallyEscapedPathUuri = UURIFactory.getInstance(
822 partiallyEscapedPath);
823
824
825
826
827 assertEquals("partiallyEscapedPath getEscapedURI",
828 "http://www.example.com/pa%20th%",
829 partiallyEscapedPathUuri.getEscapedURI());
830
831 String plainQueryString = "http://www.example.com/path?q=foo%";
832 UURI plainQueryStringUuri = UURIFactory.getInstance(
833 plainQueryString);
834
835
836
837 assertEquals("plainQueryString getEscapedURI",
838 "http://www.example.com/path?q=foo%",
839 plainQueryStringUuri.getEscapedURI());
840
841 String partiallyEscapedQueryString =
842 "http://www.example.com/pa%20th?q=foo%";
843 UURI partiallyEscapedQueryStringUuri = UURIFactory.getInstance(
844 partiallyEscapedQueryString);
845 assertEquals("partiallyEscapedQueryString getURI",
846 "http://www.example.com/pa th?q=foo%",
847 partiallyEscapedQueryStringUuri.getURI());
848 assertEquals("partiallyEscapedQueryString getEscapedURI",
849 "http://www.example.com/pa%20th?q=foo%",
850 partiallyEscapedQueryStringUuri.getEscapedURI());
851 }
852
853 /***
854 * Ensure that stray '%' characters do not prevent
855 * UURI instances from being created, and are reasonably
856 * escaped when encountered.
857 *
858 * @throws URIException
859 */
860 public void testStrayPercents() throws URIException {
861 String oneStray = "http://www.example.com/pa%th";
862 UURI oneStrayUuri = UURIFactory.getInstance(oneStray);
863 assertEquals("oneStray getURI", oneStray, oneStrayUuri.getURI());
864 assertEquals("oneStray getEscapedURI",
865 "http://www.example.com/pa%th", // browsers don't escape '%'
866 oneStrayUuri.getEscapedURI());
867
868 String precededByValidEscape = "http://www.example.com/pa%20th%way";
869 UURI precededByValidEscapeUuri = UURIFactory.getInstance(
870 precededByValidEscape);
871 assertEquals("precededByValidEscape getURI",
872 "http://www.example.com/pa th%way", // getURI interprets escapes
873 precededByValidEscapeUuri.getURI());
874 assertEquals("precededByValidEscape getEscapedURI",
875 "http://www.example.com/pa%20th%way",
876 precededByValidEscapeUuri.getEscapedURI());
877
878 String followedByValidEscape = "http://www.example.com/pa%th%20way";
879 UURI followedByValidEscapeUuri = UURIFactory.getInstance(
880 followedByValidEscape);
881 assertEquals("followedByValidEscape getURI",
882 "http://www.example.com/pa%th way", // getURI interprets escapes
883 followedByValidEscapeUuri.getURI());
884 assertEquals("followedByValidEscape getEscapedURI",
885 "http://www.example.com/pa%th%20way",
886 followedByValidEscapeUuri.getEscapedURI());
887 }
888
889 public void testEscapingNotNecessary() throws URIException {
890 String escapesUnnecessary =
891 "http://www.example.com/misc;reserved:chars@that&don't=need"
892 +"+escaping$even,though!you(might)initially?think#so";
893
894 String expected = escapesUnnecessary.substring(0, escapesUnnecessary
895 .length() - 3);
896 assertEquals("escapes unnecessary",
897 expected,
898 UURIFactory.getInstance(escapesUnnecessary).toString());
899 }
900
901 public void testIdn() throws URIException {
902
903 String idn1 = new String("http://räksmörgås.josefßon.org/");
904 String puny1 = "http://xn--rksmrgs-5wao1o.josefsson.org/";
905 assertEquals("encoding of " + idn1, puny1, UURIFactory
906 .getInstance(idn1).toString());
907 String idn2 = "http://www.pølse.dk/";
908 String puny2 = "http://www.xn--plse-gra.dk/";
909 assertEquals("encoding of " + idn2, puny2, UURIFactory
910 .getInstance(idn2).toString());
911 String idn3 = "http://例子.測試";
912 String puny3 = "http://xn--fsqu00a.xn--g6w251d/";
913 assertEquals("encoding of " + idn3, puny3, UURIFactory
914 .getInstance(idn3).toString());
915
916 }
917
918 public void testNewLineInURL() throws URIException {
919 UURI uuri = UURIFactory.getInstance("http://www.ar\rchive\n." +
920 "org/i\n\n\r\rndex.html");
921 assertEquals("http://www.archive.org/index.html", uuri.toString());
922 }
923
924 public void testTabsInURL() throws URIException {
925 UURI uuri = UURIFactory.getInstance("http://www.ar\tchive\t." +
926 "org/i\t\r\n\tndex.html");
927 assertEquals("http://www.archive.org/index.html", uuri.toString());
928 }
929
930 public void testQueryEscaping() throws URIException {
931 UURI uuri = UURIFactory.getInstance(
932 "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'\";:/?.>,<");
933 assertEquals(
934
935 "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'%22;:/?.%3E,%3C",
936 uuri.toString());
937 }
938
939 /***
940 * Check that our 'normalization' does same as Nutch's
941 * Below before-and-afters were taken from the nutch urlnormalizer-basic
942 * TestBasicURLNormalizer class (December 2006, Nutch 0.9-dev).
943 * @throws URIException
944 */
945 public void testSameAsNutchURLFilterBasic() throws URIException {
946 assertEquals(UURIFactory.getInstance(" http://foo.com/ ").toString(),
947 "http://foo.com/");
948
949
950 assertEquals(UURIFactory.getInstance("HTTP://foo.com/").toString(),
951 "http://foo.com/");
952
953
954 assertEquals(UURIFactory.
955 getInstance("http://Foo.Com/index.html").toString(),
956 "http://foo.com/index.html");
957 assertEquals(UURIFactory.
958 getInstance("http://Foo.Com/index.html").toString(),
959 "http://foo.com/index.html");
960
961
962 assertEquals(UURIFactory.
963 getInstance("http://foo.com:80/index.html").toString(),
964 "http://foo.com/index.html");
965 assertEquals(UURIFactory.getInstance("http://foo.com:81/").toString(),
966 "http://foo.com:81/");
967
968
969 assertEquals(UURIFactory.getInstance("http://foo.com").toString(),
970 "http://foo.com/");
971
972
973 assertEquals(UURIFactory.
974 getInstance("http://foo.com/foo.html#ref").toString(),
975 "http://foo.com/foo.html");
976
977
978
979
980
981 assertEquals(UURIFactory.
982 getInstance("http://foo.com/aa/../").toString(),
983 "http://foo.com/" );
984 assertEquals(UURIFactory.
985 getInstance("http://foo.com/aa/bb/../").toString(),
986 "http://foo.com/aa/");
987
988
989
990
991
992
993
994 assertEquals(UURIFactory.
995 getInstance("http://foo.com/aa/bb/cc/../../foo.html").toString(),
996 "http://foo.com/aa/foo.html");
997 assertEquals(UURIFactory.
998 getInstance("http://foo.com/aa/bb/../cc/dd/../ee/foo.html").
999 toString(),
1000 "http://foo.com/aa/cc/ee/foo.html");
1001 assertEquals(UURIFactory.
1002 getInstance("http://foo.com/../foo.html").toString(),
1003 "http://foo.com/foo.html" );
1004 assertEquals(UURIFactory.
1005 getInstance("http://foo.com/../../foo.html").toString(),
1006 "http://foo.com/foo.html" );
1007 assertEquals(UURIFactory.
1008 getInstance("http://foo.com/../aa/../foo.html").toString(),
1009 "http://foo.com/foo.html" );
1010 assertEquals(UURIFactory.
1011 getInstance("http://foo.com/aa/../../foo.html").toString(),
1012 "http://foo.com/foo.html" );
1013 assertEquals(UURIFactory.
1014 getInstance("http://foo.com/aa/../bb/../foo.html/../../").
1015 toString(),
1016 "http://foo.com/" );
1017 assertEquals(UURIFactory.getInstance("http://foo.com/../aa/foo.html").
1018 toString(), "http://foo.com/aa/foo.html" );
1019 assertEquals(UURIFactory.
1020 getInstance("http://foo.com/../aa/../foo.html").toString(),
1021 "http://foo.com/foo.html" );
1022 assertEquals(UURIFactory.
1023 getInstance("http://foo.com/a..a/foo.html").toString(),
1024 "http://foo.com/a..a/foo.html" );
1025 assertEquals(UURIFactory.
1026 getInstance("http://foo.com/a..a/../foo.html").toString(),
1027 "http://foo.com/foo.html" );
1028 assertEquals(UURIFactory.
1029 getInstance("http://foo.com/foo.foo/../foo.html").toString(),
1030 "http://foo.com/foo.html" );
1031 }
1032
1033 public void testHttpSchemeColonSlash() {
1034 boolean exception = false;
1035 try {
1036 UURIFactory.getInstance("https:/");
1037 } catch (URIException e) {
1038 exception = true;
1039 }
1040 assertTrue("Didn't throw exception when one expected", exception);
1041 exception = false;
1042 try {
1043 UURIFactory.getInstance("http://");
1044 } catch (URIException e) {
1045 exception = true;
1046 }
1047 assertTrue("Didn't throw exception when one expected", exception);
1048 }
1049
1050 public void testNakedHttpsSchemeColon() {
1051 boolean exception = false;
1052 try {
1053 UURIFactory.getInstance("https:");
1054 } catch (URIException e) {
1055 exception = true;
1056 }
1057 assertTrue("Didn't throw exception when one expected", exception);
1058 exception = false;
1059 try {
1060 UURI base = UURIFactory.getInstance("http://www.example.com");
1061 UURIFactory.getInstance(base, "https:");
1062 } catch (URIException e) {
1063 exception = true;
1064 }
1065 assertTrue("Didn't throw exception when one expected", exception);
1066 }
1067
1068 /***
1069 * Test motivated by [#HER-616] The UURI class may throw
1070 * NullPointerException in getReferencedHost()
1071 *
1072 * @throws URIException
1073 */
1074 public void testMissingHttpColon() throws URIException {
1075 String suspectUri = "http//www.test.foo";
1076 UURI base = UURIFactory.getInstance("http://www.example.com");
1077 boolean exceptionThrown = false;
1078 try {
1079 UURI badUuri = UURIFactory.getInstance(suspectUri);
1080 badUuri.getReferencedHost();
1081 } catch (URIException e) {
1082
1083 exceptionThrown = true;
1084 } finally {
1085 assertTrue("expected exception not thrown",exceptionThrown);
1086 }
1087 UURI goodUuri = UURIFactory.getInstance(base,suspectUri);
1088 goodUuri.getReferencedHost();
1089 }
1090
1091 /***
1092 * Test bad port throws URIException not NumberFormatException
1093 */
1094 public void testExtremePort() {
1095 try {
1096 UURI uuri = UURIFactory.getInstance("http://Tel.:016099117464");
1097 System.out.println(uuri);
1098 fail("expected exception not thrown");
1099 } catch (URIException ue){
1100
1101 }
1102 }
1103 }