24 package org.archive.crawler.datamodel;
25
26 import java.io.IOException;
27 import java.io.ObjectInputStream;
28 import java.io.ObjectOutputStream;
29 import java.util.ArrayList;
30 import java.util.Collection;
31 import java.util.HashSet;
32 import java.util.Iterator;
33 import java.util.List;
34 import java.util.Set;
35 import java.util.concurrent.CopyOnWriteArrayList;
36
37 import org.apache.commons.httpclient.HttpStatus;
38 import org.apache.commons.httpclient.URIException;
39 import org.archive.crawler.datamodel.credential.CredentialAvatar;
40 import org.archive.crawler.datamodel.credential.Rfc2617Credential;
41 import org.archive.crawler.extractor.Link;
42 import org.archive.crawler.framework.Processor;
43 import org.archive.crawler.framework.ProcessorChain;
44 import org.archive.crawler.util.Transform;
45 import org.archive.net.UURI;
46 import org.archive.net.UURIFactory;
47 import org.archive.util.Base32;
48 import org.archive.util.HttpRecorder;
49
50 import st.ata.util.AList;
51 import st.ata.util.HashtableAList;
52
53
54 /***
55 * Represents a candidate URI and the associated state it
56 * collects as it is crawled.
57 *
58  * <p>Core state is kept in instance variables, but a flexible
59  * attribute list is also available. Use this 'bucket' to carry
60  * data extracted by custom processing and other state across
61  * CrawlURI processing. See {@link #putString(String, String)},
62  * {@link #getString(String)}, etc.
63 *
64 * @author Gordon Mohr
65 */
66 public class CrawlURI extends CandidateURI
67 implements FetchStatusCodes {
68
69 private static final long serialVersionUID = 7874096757350100472L;
70
71 public static final int UNCALCULATED = -1;
72
81 transient private Processor nextProcessor;    // processor to run next, when overriding the default flow
82 transient private ProcessorChain nextProcessorChain;    // processor chain to run after the current one
83 private int fetchStatus = 0;    // fetch status; reset to S_UNATTEMPTED between processing passes
84 private int deferrals = 0;    // count of deferrals (e.g. while awaiting prerequisites)
85 private int fetchAttempts = 0;    // number of fetch attempts made so far
86 transient private int threadNumber;    // number of the ToeThread responsible for processing this URI
87
88
89 /*** @deprecated */
90 private int linkHopCount = UNCALCULATED;
91 /*** @deprecated */
92 private int embedHopCount = UNCALCULATED;
93
94
95 private String userAgent = null;
96
97
98
99 transient private boolean linkExtractorFinished = false;
100
101 /***
102 * Protection against outlink overflow.
103 * Change value by setting alternate maximum in heritrix.properties.
104 */
105 public static final int MAX_OUTLINKS = Integer.
106 parseInt(System.getProperty(CrawlURI.class.getName() + ".maxOutLinks",
107 "6000"));
108
109 transient private int discardedOutlinks = 0;
110
111
112 private long contentSize = UNCALCULATED;
113 private long contentLength = UNCALCULATED;
114
115 /***
116 * Current http recorder.
117 *
118 * Gets set upon successful request. Reset at start of processing chain.
119 */
120 private transient HttpRecorder httpRecorder = null;
121
122 /***
123 * Content type of a successfully fetched URI.
124 *
125 * May be null even on successfully fetched URI.
126 */
127 private String contentType = null;
128
129 /***
130 * True if this CrawlURI has been deemed a prerequisite by the
131 * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
132 *
133  * This flag is used inside the precondition enforcer so that
134  * subsequent prerequisite tests know to let this CrawlURI through,
135  * because it is a prerequisite needed by an earlier prerequisite test
136  * (e.g. if this is a robots.txt fetch, the later login-credentials
137  * prerequisite test must not throw it out just because it is not a login curi).
138 */
139 private boolean prerequisite = false;
140
141 /***
142  * Set to true if this <code>curi</code> is to be POST'd rather than GET'd.
143 */
144 private boolean post = false;
145
146 /***
147 * Monotonically increasing number within a crawl;
148 * useful for tending towards breadth-first ordering.
149 * Will sometimes be truncated to 48 bits, so behavior
150 * over 281 trillion instantiated CrawlURIs may be
151  * buggy.
152 */
153 protected long ordinal;
154
155 /***
156 * Cache of this candidate uuri as a string.
157 *
158 * Profiling shows us spending about 1-2% of total elapsed time in
159 * toString.
160 */
161 private String cachedCrawlURIString = null;
162
163 /***
164 * Array to hold keys of alist members that persist across URI processings.
165 * Any key mentioned in this list will not be cleared out at the end
166 * of a pass down the processing chain.
167 */
168 private static final List<Object> alistPersistentMember
169 = new CopyOnWriteArrayList<Object>(
170 new String [] {A_CREDENTIAL_AVATARS_KEY});
171
172 /***
173 * A digest (hash, usually SHA1) of retrieved content-body.
174 *
175 */
176 private byte[] contentDigest = null;
177 private String contentDigestScheme = null;
178
179
180 /***
181 * Create a new instance of CrawlURI from a {@link UURI}.
182 *
183 * @param uuri the UURI to base this CrawlURI on.
184 */
185 public CrawlURI(UURI uuri) {
186 super(uuri);
187 }
188
189 /***
190 * Create a new instance of CrawlURI from a {@link CandidateURI}
191 *
192 * @param caUri the CandidateURI to base this CrawlURI on.
193 * @param o Monotonically increasing number within a crawl.
194 */
195 @SuppressWarnings("deprecation")
196 public CrawlURI(CandidateURI caUri, long o) {
197 super(caUri.getUURI(), caUri.getPathFromSeed(), caUri.getVia(),
198 caUri.getViaContext());
199 ordinal = o;
200 setIsSeed(caUri.isSeed());
201 setSchedulingDirective(caUri.getSchedulingDirective());
202 setAList(caUri.getAList());
203 }
204
205 /***
206 * Takes a status code and converts it into a human readable string.
207 *
208 * @param code the status code
209 * @return a human readable string declaring what the status code is.
210 */
211 public static String fetchStatusCodesToString(int code){
212 switch(code){
213
214 case S_DNS_SUCCESS : return "DNS-1-OK";
215
216 case 100 : return "HTTP-100-Info-Continue";
217 case 101 : return "HTTP-101-Info-Switching Protocols";
218
219 case 200 : return "HTTP-200-Success-OK";
220 case 201 : return "HTTP-201-Success-Created";
221 case 202 : return "HTTP-202-Success-Accepted";
222 case 203 : return "HTTP-203-Success-Non-Authoritative";
223 case 204 : return "HTTP-204-Success-No Content";
224 case 205 : return "HTTP-205-Success-Reset Content";
225 case 206 : return "HTTP-206-Success-Partial Content";
226
227 case 300 : return "HTTP-300-Redirect-Multiple Choices";
228 case 301 : return "HTTP-301-Redirect-Moved Permanently";
229 case 302 : return "HTTP-302-Redirect-Found";
230 case 303 : return "HTTP-303-Redirect-See Other";
231 case 304 : return "HTTP-304-Redirect-Not Modified";
232 case 305 : return "HTTP-305-Redirect-Use Proxy";
233 case 307 : return "HTTP-307-Redirect-Temporary Redirect";
234
235 case 400 : return "HTTP-400-ClientErr-Bad Request";
236 case 401 : return "HTTP-401-ClientErr-Unauthorized";
237 case 402 : return "HTTP-402-ClientErr-Payment Required";
238 case 403 : return "HTTP-403-ClientErr-Forbidden";
239 case 404 : return "HTTP-404-ClientErr-Not Found";
240 case 405 : return "HTTP-405-ClientErr-Method Not Allowed";
241 case 406 : return "HTTP-406-ClientErr-Not Acceptable";
242 case 407 : return "HTTP-407-ClientErr-Proxy Authentication Required";
243 case 408 : return "HTTP-408-ClientErr-Request Timeout";
244 case 409 : return "HTTP-409-ClientErr-Conflict";
245 case 410 : return "HTTP-410-ClientErr-Gone";
246 case 411 : return "HTTP-411-ClientErr-Length Required";
247 case 412 : return "HTTP-412-ClientErr-Precondition Failed";
248 case 413 : return "HTTP-413-ClientErr-Request Entity Too Large";
249 case 414 : return "HTTP-414-ClientErr-Request-URI Too Long";
250 case 415 : return "HTTP-415-ClientErr-Unsupported Media Type";
251 case 416 : return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
252 case 417 : return "HTTP-417-ClientErr-Expectation Failed";
253
254 case 500 : return "HTTP-500-ServerErr-Internal Server Error";
255 case 501 : return "HTTP-501-ServerErr-Not Implemented";
256 case 502 : return "HTTP-502-ServerErr-Bad Gateway";
257 case 503 : return "HTTP-503-ServerErr-Service Unavailable";
258 case 504 : return "HTTP-504-ServerErr-Gateway Timeout";
259 case 505 : return "HTTP-505-ServerErr-HTTP Version Not Supported";
260
261 case S_BLOCKED_BY_USER:
262 return "Heritrix(" + S_BLOCKED_BY_USER + ")-Blocked by user";
263 case S_BLOCKED_BY_CUSTOM_PROCESSOR:
264 return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR +
265 ")-Blocked by custom prefetch processor";
266 case S_DELETED_BY_USER:
267 return "Heritrix(" + S_DELETED_BY_USER + ")-Deleted by user";
268 case S_CONNECT_FAILED:
269 return "Heritrix(" + S_CONNECT_FAILED + ")-Connection failed";
270 case S_CONNECT_LOST:
271 return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
272 case S_DEEMED_CHAFF:
273 return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
274 case S_DEFERRED:
275 return "Heritrix(" + S_DEFERRED + ")-Deferred";
276 case S_DOMAIN_UNRESOLVABLE:
277 return "Heritrix(" + S_DOMAIN_UNRESOLVABLE
278 + ")-Domain unresolvable";
279 case S_OUT_OF_SCOPE:
280 return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
281 case S_DOMAIN_PREREQUISITE_FAILURE:
282 return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE
283 + ")-Domain prerequisite failure";
284 case S_ROBOTS_PREREQUISITE_FAILURE:
285 return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE
286 + ")-Robots prerequisite failure";
287 case S_OTHER_PREREQUISITE_FAILURE:
288 return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE
289 + ")-Other prerequisite failure";
290 case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
291 return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE
292 + ")-Prerequisite unschedulable failure";
293 case S_ROBOTS_PRECLUDED:
294 return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded";
295 case S_RUNTIME_EXCEPTION:
296 return "Heritrix(" + S_RUNTIME_EXCEPTION
297 + ")-Runtime exception";
298 case S_SERIOUS_ERROR:
299 return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
300 case S_TIMEOUT:
301 return "Heritrix(" + S_TIMEOUT + ")-Timeout";
302 case S_TOO_MANY_EMBED_HOPS:
303 return "Heritrix(" + S_TOO_MANY_EMBED_HOPS
304 + ")-Too many embed hops";
305 case S_TOO_MANY_LINK_HOPS:
306 return "Heritrix(" + S_TOO_MANY_LINK_HOPS
307 + ")-Too many link hops";
308 case S_TOO_MANY_RETRIES:
309 return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries";
310 case S_UNATTEMPTED:
311 return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
312 case S_UNFETCHABLE_URI:
313 return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI";
314 case S_PROCESSING_THREAD_KILLED:
315 return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-" +
316 "Processing thread killed";
317
318 default : return Integer.toString(code);
319 }
320 }
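// Illustrative use, e.g. when building a human-readable log line
// (assumes a 'curi' variable in scope):
//   String readable =
//       CrawlURI.fetchStatusCodesToString(curi.getFetchStatus());
//   // 200 -> "HTTP-200-Success-OK"; unknown codes fall through to their
//   // plain integer string.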
321
322
323 /***
324 * Return the overall/fetch status of this CrawlURI for its
325 * current trip through the processing loop.
326 *
327 * @return a value from FetchStatusCodes
328 */
329 public int getFetchStatus(){
330 return fetchStatus;
331 }
332
333 /***
334 * Set the overall/fetch status of this CrawlURI for
335 * its current trip through the processing loop.
336 *
337 * @param newstatus a value from FetchStatusCodes
338 */
339 public void setFetchStatus(int newstatus){
340 fetchStatus = newstatus;
341 }
342
343 /***
344 * Get the number of attempts at getting the document referenced by this
345 * URI.
346 *
347 * @return the number of attempts at getting the document referenced by this
348 * URI.
349 */
350 public int getFetchAttempts() {
351 return fetchAttempts;
352 }
353
354 /***
355 * Increment the number of attempts at getting the document referenced by
356 * this URI.
357 *
358  * @return the number of attempts made prior to this increment
359  * (post-increment semantics).
360  */
361 public int incrementFetchAttempts() {
362 // post-increment: callers receive the count as it was before this attempt
363 return fetchAttempts++;
364 }
365
366 /***
367 * Reset fetchAttempts counter.
368 */
369 public void resetFetchAttempts() {
370 this.fetchAttempts = 0;
371 }
372
373 /***
374 * Reset deferrals counter.
375 */
376 public void resetDeferrals() {
377 this.deferrals = 0;
378 }
379
380 /***
381 * Get the next processor to process this URI.
382 *
383 * @return the processor that should process this URI next.
384 */
385 public Processor nextProcessor() {
386 return nextProcessor;
387 }
388
389 /***
390 * Get the processor chain that should be processing this URI after the
391 * current chain is finished with it.
392 *
393 * @return the next processor chain to process this URI.
394 */
395 public ProcessorChain nextProcessorChain() {
396 return nextProcessorChain;
397 }
398
399 /***
400 * Set the next processor to process this URI.
401 *
402 * @param processor the next processor to process this URI.
403 */
404 public void setNextProcessor(Processor processor) {
405 nextProcessor = processor;
406 }
407
408 /***
409 * Set the next processor chain to process this URI.
410 *
411 * @param nextProcessorChain the next processor chain to process this URI.
412 */
413 public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
414 this.nextProcessorChain = nextProcessorChain;
415 }
416
417 /***
418 * Do all actions associated with setting a <code>CrawlURI</code> as
419 * requiring a prerequisite.
420 *
421  * @param preq String URI of the prerequisite to set.
422  * @param lastProcessorChain Last processor chain reference. This chain is
423  * where this <code>CrawlURI</code> goes next.
424 * @throws URIException
425 */
426 public void markPrerequisite(String preq,
427 ProcessorChain lastProcessorChain) throws URIException {
428 Link link = createLink(preq,Link.PREREQ_MISC,Link.PREREQ_HOP);
429 setPrerequisiteUri(link);
430 incrementDeferrals();
431 setFetchStatus(S_DEFERRED);
432 skipToProcessorChain(lastProcessorChain);
433 }
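// Sketch of typical use by a precondition-style processor: defer this URI
// until, say, its robots.txt has been fetched. 'postprocessorChain' here is
// an assumed reference to the chain the deferred URI should jump to:
//   curi.markPrerequisite("http://www.example.com/robots.txt",
//       postprocessorChain);
//   // markPrerequisite() records the prerequisite Link, bumps the deferral
//   // count, sets S_DEFERRED and skips ahead to the given chain.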
434
435 /***
436 * Set a prerequisite for this URI.
437 * <p>
438 * A prerequisite is a URI that must be crawled before this URI can be
439 * crawled.
440 *
441 * @param link Link to set as prereq.
442 */
443 public void setPrerequisiteUri(Object link) {
444 putObject(A_PREREQUISITE_URI, link);
445 }
446
447 /***
448 * Get the prerequisite for this URI.
449 * <p>
450 * A prerequisite is a URI that must be crawled before this URI can be
451 * crawled.
452 *
453 * @return the prerequisite for this URI or null if no prerequisite.
454 */
455 public Object getPrerequisiteUri() {
456 return getObject(A_PREREQUISITE_URI);
457 }
458
459 /***
460 * @return True if this CrawlURI has a prerequisite.
461 */
462 public boolean hasPrerequisiteUri() {
463 return containsKey(A_PREREQUISITE_URI);
464 }
465
466 /***
467 * Returns true if this CrawlURI is a prerequisite.
468 *
469 * @return true if this CrawlURI is a prerequisite.
470 */
471 public boolean isPrerequisite() {
472 return this.prerequisite;
473 }
474
475 /***
476 * Set if this CrawlURI is itself a prerequisite URI.
477 *
478  * @param prerequisite True if this CrawlURI is itself a prerequisite URI.
479 */
480 public void setPrerequisite(boolean prerequisite) {
481 this.prerequisite = prerequisite;
482 }
483
484 /***
485  * @return This crawl URI as a string, wrapped with 'CrawlURI(' and ')'.
487 */
488 public String getCrawlURIString() {
489 if (this.cachedCrawlURIString == null) {
490 synchronized (this) {
491 if (this.cachedCrawlURIString == null) {
492 this.cachedCrawlURIString =
493 "CrawlURI(" + toString() + ")";
494 }
495 }
496 }
497 return this.cachedCrawlURIString;
498 }
499
500 /***
501 * Get the content type of this URI.
502 *
503 * @return Fetched URIs content type. May be null.
504 */
505 public String getContentType() {
506 return this.contentType;
507 }
508
509 /***
510 * Set a fetched uri's content type.
511 *
512  * @param ct Content type. May be null.
513 */
514 public void setContentType(String ct) {
515 this.contentType = ct;
516 }
517
518 /***
519 * Set the number of the ToeThread responsible for processing this uri.
520 *
521 * @param i the ToeThread number.
522 */
523 public void setThreadNumber(int i) {
524 threadNumber = i;
525 }
526
527 /***
528 * Get the number of the ToeThread responsible for processing this uri.
529 *
530 * @return the ToeThread number.
531 */
532 public int getThreadNumber() {
533 return threadNumber;
534 }
535
536 /***
537 * Increment the deferral count.
538 *
539 */
540 public void incrementDeferrals() {
541 deferrals++;
542 }
543
544 /***
545 * Get the deferral count.
546 *
547 * @return the deferral count.
548 */
549 public int getDeferrals() {
550 return deferrals;
551 }
552
553 /***
554 * Remove all attributes set on this uri.
555 * <p>
556 * This methods removes the attribute list.
557 */
558 public void stripToMinimal() {
559 clearAList();
560 }
561
562 /***
563 * Get the size in bytes of this URI's recorded content, inclusive
564 * of things like protocol headers. It is the responsibility of the
565 * classes which fetch the URI to set this value accordingly -- it is
566 * not calculated/verified within CrawlURI.
567 *
568 * This value is consulted in reporting/logging/writing-decisions.
569 *
570  * @see #setContentSize(long)
571 * @return contentSize
572 */
573 public long getContentSize(){
574 return contentSize;
575 }
576
577 /***
578 * Make note of a non-fatal error, local to a particular Processor,
579 * which should be logged somewhere, but allows processing to continue.
580 *
581 * This is how you add to the local-error log (the 'localized' in
582 * the below is making an error local rather than global, not
583 * making a swiss-french version of the error.).
584 *
585 * @param processorName Name of processor the exception was thrown
586 * in.
587 * @param ex Throwable to log.
588 * @param message Extra message to log beyond exception message.
589 */
590 public void addLocalizedError(final String processorName,
591 final Throwable ex, final String message) {
592 List<LocalizedError> localizedErrors;
593 if (containsKey(A_LOCALIZED_ERRORS)) {
594 @SuppressWarnings("unchecked")
595 List<LocalizedError> temp
596 = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS);
597 localizedErrors = temp;
598 } else {
599 localizedErrors = new ArrayList<LocalizedError>();
600 putObject(A_LOCALIZED_ERRORS, localizedErrors);
601 }
602
603 localizedErrors.add(new LocalizedError(processorName, ex, message));
604 addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" +
605 processorName);
606 }
607
608
609 protected String getClassSimpleName(final Class<?> c) {
610 String classname = c.getName();
611 int index = classname.lastIndexOf('.');
612 return ((index > 0 && (index + 1) < classname.length())?
613 classname.substring(index + 1): classname);
614 }
615
616 /***
617  * Add an annotation: an abbreviated indication of something special
618 * about this URI that need not be present in every crawl.log line,
619 * but should be noted for future reference.
620 *
621 * @param annotation the annotation to add; should not contain
622 * whitespace or a comma
623 */
624 public void addAnnotation(String annotation) {
625 String annotations;
626 if(containsKey(A_ANNOTATIONS)) {
627 annotations = getString(A_ANNOTATIONS);
628 annotations += ","+annotation;
629 } else {
630 annotations = annotation;
631 }
632
633 putString(A_ANNOTATIONS,annotations);
634 }
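// Annotations accumulate as a comma-separated string under A_ANNOTATIONS,
// e.g. (illustrative values):
//   curi.addAnnotation("3t");
//   curi.addAnnotation("dol:5");
//   curi.getAnnotations();   // -> "3t,dol:5"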
635
636 /***
637 * TODO: Implement truncation using booleans rather than as this
638 * ugly String parse.
639 * @return True if fetch was truncated.
640 */
641 public boolean isTruncatedFetch() {
642 return annotationContains(TRUNC_SUFFIX);
643 }
644
645 public boolean isLengthTruncatedFetch() {
646 return annotationContains(LENGTH_TRUNC);
647 }
648
649 public boolean isTimeTruncatedFetch() {
650 return annotationContains(TIMER_TRUNC);
651 }
652
653 public boolean isHeaderTruncatedFetch() {
654 return annotationContains(HEADER_TRUNC);
655 }
656
657 protected boolean annotationContains(final String str2Find) {
658 boolean result = false;
659 if (!containsKey(A_ANNOTATIONS)) {
660 return result;
661 }
662 String annotations = getString(A_ANNOTATIONS);
663 if (annotations != null && annotations.length() > 0) {
664 result = annotations.indexOf(str2Find) >= 0;
665 }
666 return result;
667 }
668
669 /***
670 * Get the annotations set for this uri.
671 *
672 * @return the annotations set for this uri.
673 */
674 public String getAnnotations() {
675 return (containsKey(A_ANNOTATIONS))?
676 getString(A_ANNOTATIONS): null;
677 }
678
679 /***
680  * Get the embed hop count.
681  *
682  * @return the embed hop count.
683 * @deprecated
684 */
685 public int getEmbedHopCount() {
686 return embedHopCount;
687 }
688
689 /***
690 * Get the link hop count.
691 *
692 * @return the link hop count.
693 * @deprecated
694 */
695 public int getLinkHopCount() {
696 return linkHopCount;
697 }
698
699 /***
700 * Mark this uri as being a seed.
701 *
702 *
703 * @deprecated
704 */
705 public void markAsSeed() {
706 linkHopCount = 0;
707 embedHopCount = 0;
708 }
709
710 /***
711 * Get the user agent to use for crawling this URI.
712 *
713 * If null the global setting should be used.
714 *
715 * @return user agent or null
716 */
717 public String getUserAgent() {
718 return userAgent;
719 }
720
721 /***
722 * Set the user agent to use when crawling this URI.
723 *
724 * If not set the global settings should be used.
725 *
726 * @param string user agent to use
727 */
728 public void setUserAgent(String string) {
729 userAgent = string;
730 }
731
732 /***
733 * Set which processor should be the next processor to process this uri
734 * instead of using the default next processor.
735 *
736 * @param processorChain the processor chain to skip to.
737 * @param processor the processor in the processor chain to skip to.
738 */
739 public void skipToProcessor(ProcessorChain processorChain,
740 Processor processor) {
741 setNextProcessorChain(processorChain);
742 setNextProcessor(processor);
743 }
744
745 /***
746 * Set which processor chain should be processing this uri next.
747 *
748 * @param processorChain the processor chain to skip to.
749 */
750 public void skipToProcessorChain(ProcessorChain processorChain) {
751 setNextProcessorChain(processorChain);
752 setNextProcessor(null);
753 }
754
755 /***
756 * For completed HTTP transactions, the length of the content-body.
757 *
758 * @return For completed HTTP transactions, the length of the content-body.
759 */
760 public long getContentLength() {
761 if (this.contentLength < 0) {
762 this.contentLength = (getHttpRecorder() != null)?
763 getHttpRecorder().getResponseContentLength(): 0;
764 }
765 return this.contentLength;
766 }
767
768 /***
769 * Get size of data recorded (transferred)
770 *
771 * @return recorded data size
772 */
773 public long getRecordedSize() {
774 return (getHttpRecorder() != null)
775 ? getHttpRecorder().getRecordedInput().getSize()
776
777 : getContentSize();
778 }
779
780 /***
781 * Sets the 'content size' for the URI, which is considered inclusive
782 * of all recorded material (such as protocol headers) or even material
783 * 'virtually' considered (as in material from a previous fetch
784 * confirmed unchanged with a server). (In contrast, content-length
785 * matches the HTTP definition, that of the enclosed content-body.)
786 *
787 * Should be set by a fetcher or other processor as soon as the final
788 * size of recorded content is known. Setting to an artificial/incorrect
789 * value may affect other reporting/processing.
790 *
791 * @param l Content size.
792 */
793 public void setContentSize(long l) {
794 contentSize = l;
795 }
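// Worked example of the contentSize/contentLength distinction (numbers are
// illustrative): for a response of 300 bytes of HTTP headers plus a
// 10,000-byte body, a fetcher would setContentSize(10300), while
// getContentLength() reports only the 10,000-byte content-body.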
796
797 /***
798 * If true then a link extractor has already claimed this CrawlURI and
799 * performed link extraction on the document content. This does not
800 * preclude other link extractors that may have an interest in this
801 * CrawlURI from also doing link extraction.
802 *
803 * <p>There is an onus on link extractors to set this flag if they have
804 * run.
805 *
806 * @return True if a processor has performed link extraction on this
807 * CrawlURI
808 *
809 * @see #linkExtractorFinished()
810 */
811 public boolean hasBeenLinkExtracted(){
812 return linkExtractorFinished;
813 }
814
815 /***
816 * Note that link extraction has been performed on this CrawlURI. A processor
817  * doing link extraction should invoke this method once it has finished its
818 * work. It should invoke it even if no links are extracted. It should only
819 * invoke this method if the link extraction was performed on the document
820 * body (not the HTTP headers etc.).
821 *
822 * @see #hasBeenLinkExtracted()
823 */
824 public void linkExtractorFinished() {
825 linkExtractorFinished = true;
826 if(discardedOutlinks>0) {
827 addAnnotation("dol:"+discardedOutlinks);
828 }
829 }
830
831 /***
832 * Notify CrawlURI it is about to be logged; opportunity
833 * for self-annotation
834 */
835 public void aboutToLog() {
836 if (fetchAttempts>1) {
837 addAnnotation(fetchAttempts+"t");
838 }
839 }
840
841 /***
842 * Get the http recorder associated with this uri.
843 *
844  * @return Returns the httpRecorder. May be null, but it is set early in
845  * FetchHTTP, so a null value later in processing usually indicates a problem.
846 */
847 public HttpRecorder getHttpRecorder() {
848 return httpRecorder;
849 }
850
851 /***
852 * Set the http recorder to be associated with this uri.
853 *
854 * @param httpRecorder The httpRecorder to set.
855 */
856 public void setHttpRecorder(HttpRecorder httpRecorder) {
857 this.httpRecorder = httpRecorder;
858 }
859
860 /***
861 * Return true if this is a http transaction.
862 *
863  * TODO: Combine this and the {@link #isPost()} method so that there is one
864  * place to go to find out whether this is HTTP GET, HTTP POST, FTP, or DNS.
865 *
866 * @return True if this is a http transaction.
867 */
868 public boolean isHttpTransaction() {
869 return containsKey(A_HTTP_TRANSACTION);
870 }
871
872 /***
873 * Clean up after a run through the processing chain.
874 *
875 * Called on the end of processing chain by Frontier#finish. Null out any
876 * state gathered during processing.
877 */
878 public void processingCleanup() {
879 this.httpRecorder = null;
880 this.fetchStatus = S_UNATTEMPTED;
881 this.setPrerequisite(false);
882 this.contentSize = UNCALCULATED;
883 this.contentLength = UNCALCULATED;
884
885 this.linkExtractorFinished = false;
886
887 setAList(getPersistentAList());
888 }
889
890 public AList getPersistentAList() {
891 AList newAList = new HashtableAList();
892
893 if(alistPersistentMember!=null && alistPersistentMember.size() > 0) {
894 newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList());
895 }
896
897 List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
898 if(heritableKeys!=null) {
899 newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
900 }
901 return newAList;
902 }
903
904 /***
905 * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>.
906 *
907  * It is safe to pass a CrawlURI instance; in that case it is simply
908  * returned. Otherwise, a new CrawlURI instance is created.
909 *
910 * @param caUri Candidate URI.
911 * @param ordinal
912 * @return A crawlURI made from the passed CandidateURI.
913 */
914 public static CrawlURI from(CandidateURI caUri, long ordinal) {
915 return (caUri instanceof CrawlURI)?
916 (CrawlURI)caUri: new CrawlURI(caUri, ordinal);
917 }
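// Illustrative use by a frontier handing out work ('nextOrdinal' is an
// assumed counter maintained by the caller):
//   CrawlURI curi = CrawlURI.from(candidate, nextOrdinal++);
//   // If 'candidate' is already a CrawlURI, the same instance comes back;
//   // otherwise a new CrawlURI is built around it with the given ordinal.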
918
919 /***
920 * @param avatars Credential avatars to save off.
921 */
922 private void setCredentialAvatars(Set avatars) {
923 putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
924 }
925
926 /***
927 * @return Credential avatars. Null if none set.
928 */
929 @SuppressWarnings("unchecked")
930 public Set<CredentialAvatar> getCredentialAvatars() {
931 return (Set)getObject(A_CREDENTIAL_AVATARS_KEY);
932 }
933
934 /***
935 * @return True if there are avatars attached to this instance.
936 */
937 public boolean hasCredentialAvatars() {
938 return getCredentialAvatars() != null &&
939 getCredentialAvatars().size() > 0;
940 }
941
942 /***
943 * Add an avatar.
944 *
945 * We do lazy instantiation.
946 *
947 * @param ca Credential avatar to add to set of avatars.
948 */
949 public void addCredentialAvatar(CredentialAvatar ca) {
950 Set<CredentialAvatar> avatars = getCredentialAvatars();
951 if (avatars == null) {
952 avatars = new HashSet<CredentialAvatar>();
953 setCredentialAvatars(avatars);
954 }
955 avatars.add(ca);
956 }
957
958 /***
959 * Remove all credential avatars from this crawl uri.
960 */
961 public void removeCredentialAvatars() {
962 if (hasCredentialAvatars()) {
963 remove(A_CREDENTIAL_AVATARS_KEY);
964 }
965 }
966
967 /***
968  * Remove the passed credential avatar from this crawl uri.
969 * @param ca Avatar to remove.
970 * @return True if we removed passed parameter. False if no operation
971 * performed.
972 */
973 public boolean removeCredentialAvatar(CredentialAvatar ca) {
974 boolean result = false;
975 Set avatars = getCredentialAvatars();
976 if (avatars != null && avatars.size() > 0) {
977 result = avatars.remove(ca);
978 }
979 return result;
980 }
981
982 /***
983 * Ask this URI if it was a success or not.
984 *
985 * Only makes sense to call this method after execution of
986  * HttpMethod#execute. Regard any status larger than 0 as success
987 * except for below caveat regarding 401s. Use {@link #is2XXSuccess()} if
988 * looking for a status code in the 200 range.
989 *
990 * <p>401s caveat: If any rfc2617 credential data present and we got a 401
991 * assume it got loaded in FetchHTTP on expectation that we're to go around
992  * the processing chain again. Report this condition as a failure so we
993  * get another crack at the processing chain, this time making use of the
994  * loaded credential data.
995 *
996  * @return True if this URI has been successfully processed.
997 * @see #is2XXSuccess()
998 */
999 public boolean isSuccess() {
1000 boolean result = false;
1001 int statusCode = this.fetchStatus;
1002 if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
1003 hasRfc2617CredentialAvatar()) {
1004 result = false;
1005 } else {
1006 result = (statusCode > 0);
1007 }
1008 return result;
1009 }
1010
1011 /***
1012 * @return True if status code is in the 2xx range.
1013 * @see #isSuccess()
1014 */
1015 public boolean is2XXSuccess() {
1016 return this.fetchStatus >= 200 && this.fetchStatus < 300;
1017 }
1018
1019 /***
1020 * @return True if we have an rfc2617 payload.
1021 */
1022 public boolean hasRfc2617CredentialAvatar() {
1023 boolean result = false;
1024 Set avatars = getCredentialAvatars();
1025 if (avatars != null && avatars.size() > 0) {
1026 for (Iterator i = avatars.iterator(); i.hasNext();) {
1027 if (((CredentialAvatar)i.next()).
1028 match(Rfc2617Credential.class)) {
1029 result = true;
1030 break;
1031 }
1032 }
1033 }
1034 return result;
1035 }
1036
1037 /***
1038  * Set whether this URI should be fetched by sending an HTTP POST request
1039  * rather than an HTTP GET request.
1040  *
1041  * @param b True if this curi is to be POST'd; false if it is to be GET'd.
1042 */
1043 public void setPost(boolean b) {
1044 this.post = b;
1045 }
1046
1047 /***
1048  * Returns true if this URI should be fetched by sending an HTTP POST request.
1049 *
1050 *
1051 * TODO: Compound this and {@link #isHttpTransaction()} method so that there
1052 * is one place to go to find out if get http, post http, ftp, dns.
1053 *
1054  * @return True if this CrawlURI instance is to be POST'd.
1055 */
1056 public boolean isPost() {
1057 return this.post;
1058 }
1059
1060 /***
1061  * Set the retained content-digest value (usually SHA-1).
1062 *
1063 * @param digestValue
1064 * @deprecated Use {@link #setContentDigest(String scheme, byte[])}
1065 */
1066 public void setContentDigest(byte[] digestValue) {
1067 setContentDigest("SHA1", digestValue);
1068 }
1069
1070 public void setContentDigest(final String scheme,
1071 final byte [] digestValue) {
1072 this.contentDigest = digestValue;
1073 this.contentDigestScheme = scheme;
1074 }
1075
1076 public String getContentDigestSchemeString() {
1077 if(this.contentDigest==null) {
1078 return null;
1079 }
1080 return this.contentDigestScheme + ":" + getContentDigestString();
1081 }
1082
1083 /***
1084 * Return the retained content-digest value, if any.
1085 *
1086 * @return Digest value.
1087 */
1088 public Object getContentDigest() {
1089 return contentDigest;
1090 }
1091
1092 public String getContentDigestString() {
1093 if(this.contentDigest==null) {
1094 return null;
1095 }
1096 return Base32.encode(this.contentDigest);
1097 }
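// Sketch of how the digest accessors fit together (the scheme string and
// digest bytes below are illustrative):
//   curi.setContentDigest("sha1", digestBytes);
//   curi.getContentDigestSchemeString();  // -> "sha1:" + Base32 of the bytes
//   // The deprecated single-argument setContentDigest(byte[]) assumes "SHA1".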
1098
1099 transient Object holder;
1100 transient Object holderKey;
1101
1102 /***
1103 * Remember a 'holder' to which some enclosing/queueing
1104  * facility has assigned this CrawlURI.
1105  *
1106  * @param obj the holder object to remember
1107 */
1108 public void setHolder(Object obj) {
1109 holder=obj;
1110 }
1111
1112 /***
1113 * Return the 'holder' for the convenience of
1114 * an external facility.
1115 *
1116 * @return holder
1117 */
1118 public Object getHolder() {
1119 return holder;
1120 }
1121
1122 /***
1123 * Remember a 'holderKey' which some enclosing/queueing
1124  * facility has assigned this CrawlURI.
1125  *
1126  * @param obj the key object to remember
1127 */
1128 public void setHolderKey(Object obj) {
1129 holderKey=obj;
1130 }
1131 /***
1132 * Return the 'holderKey' for convenience of
1133 * an external facility (Frontier).
1134 *
1135 * @return holderKey
1136 */
1137 public Object getHolderKey() {
1138 return holderKey;
1139 }
1140
1141 /***
1142 * Get the ordinal (serial number) assigned at creation.
1143 *
1144 * @return ordinal
1145 */
1146 public long getOrdinal() {
1147 return ordinal;
1148 }
1149
1150 /*** Spot for an integer cost to be placed by an external facility (frontier).
1151  * The cost is truncated to 8 bits at times, so it should not exceed 255. */
1152 int holderCost = UNCALCULATED;
1153 /***
1154 * Return the 'holderCost' for convenience of external facility (frontier)
1155 * @return value of holderCost
1156 */
1157 public int getHolderCost() {
1158 return holderCost;
1159 }
1160
1161 /***
1162 * Remember a 'holderCost' which some enclosing/queueing
1163 * facility has assigned this CrawlURI
1164 * @param cost value to remember
1165 */
1166 public void setHolderCost(int cost) {
1167 holderCost = cost;
1168 }
1169
1170 /***
1171 * All discovered outbound Links (navlinks, embeds, etc.)
1172 * Can either contain Link instances or CandidateURI instances, or both.
1173 * The LinksScoper processor converts Link instances in this collection
1174 * to CandidateURI instances.
1175 */
1176 transient Collection<Object> outLinks = new ArrayList<Object>();
1177
1178 /***
1179 * Returns discovered links. The returned collection might be empty if
1180 * no links were discovered, or if something like LinksScoper promoted
1181 * the links to CandidateURIs.
1182 *
1183 * Elements can be removed from the returned collection, but not added.
1184 * To add a discovered link, use one of the createAndAdd methods or
1185 * {@link #getOutObjects()}.
1186 *
1187 * @return Collection of all discovered outbound Links
1188 */
1189 public Collection<Link> getOutLinks() {
1190 return Transform.subclasses(outLinks, Link.class);
1191 }
1192
1193 /***
1194 * Returns discovered candidate URIs. The returned collection will be
1195  * empty until something like LinksScoper promotes discovered Links
1196 * into CandidateURIs.
1197 *
1198 * Elements can be removed from the returned collection, but not added.
1199 * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or
1200 * {@link #getOutObjects}.
1201 *
1202 * @return Collection of candidate URIs
1203 */
1204 public Collection<CandidateURI> getOutCandidates() {
1205 return Transform.subclasses(outLinks, CandidateURI.class);
1206 }
1207
1208
1209 /***
1210 * Returns all of the outbound objects. The returned Collection will
1211 * contain Link instances, or CandidateURI instances, or both.
1212 *
1213 * @return the collection of Links and/or CandidateURIs
1214 */
1215 public Collection<Object> getOutObjects() {
1216 return outLinks;
1217 }
1218
1219 /***
1220 * Add a discovered Link, unless it would exceed the max number
1221 * to accept. (If so, increment discarded link counter.)
1222 *
1223 * @param link the Link to add
1224 */
1225 public void addOutLink(Link link) {
1226 if (outLinks.size() < MAX_OUTLINKS) {
1227 outLinks.add(link);
1228 } else {
1229
1230 discardedOutlinks++;
1231 }
1232 }
1233
1234 public void clearOutlinks() {
1235 this.outLinks.clear();
1236 }
1237
1238 /***
1239 * Replace current collection of links w/ passed list.
1240 * Used by Scopers adjusting the list of links (removing those
1241 * not in scope and promoting Links to CandidateURIs).
1242 *
1243  * @param links Collection of CandidateURIs replacing any previously
1244  * existing outLinks or outCandidates.
1245 */
1246 public void replaceOutlinks(Collection<CandidateURI> links) {
1247 clearOutlinks();
1248 this.outLinks.addAll(links);
1249 }
1250
1251
1252 /***
1253 * @return Count of outlinks.
1254 */
1255 public int outlinksSize() {
1256 return this.outLinks.size();
1257 }
1258
1259 /***
1260 * Convenience method for creating a Link discovered at this URI
1261 * with the given string and context
1262 *
1263 * @param url
1264 * String to use to create Link
1265 * @param context
1266 * CharSequence context to use
1267 * @param hopType
1268 * @return Link.
1269 * @throws URIException
1270 * if Link UURI cannot be constructed
1271 */
1272 public Link createLink(String url, CharSequence context,
1273 char hopType) throws URIException {
1274 return new Link(getUURI(), UURIFactory.getInstance(getUURI(),
1275 url), context, hopType);
1276 }
1277
1278 /***
1279 * Convenience method for creating a Link with the given string and
1280 * context
1281 *
1282 * @param url
1283 * String to use to create Link
1284 * @param context
1285 * CharSequence context to use
1286 * @param hopType
1287 * @throws URIException
1288 * if Link UURI cannot be constructed
1289 */
1290 public void createAndAddLink(String url, CharSequence context,
1291 char hopType) throws URIException {
1292 addOutLink(createLink(url, context, hopType));
1293 }
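// Sketch of how an extractor might record a discovered navigation link
// (the Link.NAVLINK_* constants are assumed from
// org.archive.crawler.extractor.Link, as with PREREQ_* used above):
//   curi.createAndAddLink("http://www.example.com/page.html",
//       Link.NAVLINK_MISC, Link.NAVLINK_HOP);
//   // The new Link lands in the outLinks collection unless MAX_OUTLINKS
//   // has been reached, in which case it is counted as discarded.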
1294
1295 /***
1296 * Convenience method for creating a Link with the given string and
1297 * context, relative to a previously set base HREF if available (or
1298 * relative to the current CrawlURI if no other base has been set)
1299 *
1300 * @param url String URL to add as destination of link
1301 * @param context String context where link was discovered
1302 * @param hopType char hop-type indicator
1303 * @throws URIException
1304 */
1305 public void createAndAddLinkRelativeToBase(String url,
1306 CharSequence context, char hopType) throws URIException {
1307 addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1308 getBaseURI(), url), context, hopType));
1309 }
1310
1311 /***
1312 * Convenience method for creating a Link with the given string and
1313 * context, relative to this CrawlURI's via UURI if available. (If
1314 * a via is not available, falls back to using
1315 * #createAndAddLinkRelativeToBase.)
1316 *
1317 * @param url String URL to add as destination of link
1318 * @param context String context where link was discovered
1319 * @param hopType char hop-type indicator
1320 * @throws URIException
1321 */
1322 public void createAndAddLinkRelativeToVia(String url,
1323 CharSequence context, char hopType) throws URIException {
1324 if(getVia()!=null) {
1325 addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1326 getVia(), url), context, hopType));
1327 } else {
1328
1329 createAndAddLinkRelativeToBase(url,context,hopType);
1330 }
1331 }
1332
1333 /***
1334 * Set the (HTML) Base URI used for derelativizing internal URIs.
1335 *
1336 * @param baseHref String base href to use
1337 * @throws URIException if supplied string cannot be interpreted as URI
1338 */
1339 public void setBaseURI(String baseHref) throws URIException {
1340 putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));
1341 }
1342
1343 /***
1344 * Get the (HTML) Base URI used for derelativizing internal URIs.
1345 *
1346 * @return UURI base URI previously set
1347 */
1348 public UURI getBaseURI() {
1349 if (!containsKey(A_HTML_BASE)) {
1350 return getUURI();
1351 }
1352 return (UURI)getObject(A_HTML_BASE);
1353 }
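// Typical flow (illustrative): an HTML extractor that encounters
// <base href="http://www.example.com/dir/"> calls
//   curi.setBaseURI("http://www.example.com/dir/");
// and later resolves relative links via createAndAddLinkRelativeToBase();
// without a base, getBaseURI() falls back to this URI itself.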
1354
1355 /***
1356 * Add the key of alist items you want to persist across
1357 * processings.
1358 * @param key Key to add.
1359 */
1360 public static void addAlistPersistentMember(Object key) {
1361 alistPersistentMember.add(key);
1362 }
1363
1364 /***
1365 * @param key Key to remove.
1366 * @return True if list contained the element.
1367 */
1368 public static boolean removeAlistPersistentMember(Object key) {
1369 return alistPersistentMember.remove(key);
1370 }
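// Sketch of persisting a custom AList key across processing passes (the key
// name here is hypothetical):
//   CrawlURI.addAlistPersistentMember("my-custom-state-key");
//   // Values stored under that key now survive processingCleanup(), which
//   // otherwise resets the AList to only the persistent/heritable entries.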
1371
1372 /***
1373 * Custom serialization writing an empty 'outLinks' as null. Estimated
1374 * to save ~20 bytes in serialized form.
1375 *
1376 * @param stream
1377 * @throws IOException
1378 */
1379 private void writeObject(ObjectOutputStream stream) throws IOException {
1380 stream.defaultWriteObject();
1381 stream.writeObject((outLinks.isEmpty()) ? null : outLinks);
1382 }
1383
1384 /***
1385  * Custom deserialization recreating an empty list (ArrayList) from a null
1386  * 'outLinks' slot.
1387 *
1388 * @param stream
1389 * @throws IOException
1390 * @throws ClassNotFoundException
1391 */
1392 private void readObject(ObjectInputStream stream) throws IOException,
1393 ClassNotFoundException {
1394 stream.defaultReadObject();
1395 @SuppressWarnings("unchecked")
1396 Collection<Object> ol = (Collection<Object>) stream.readObject();
1397 outLinks = (ol == null) ? new ArrayList<Object>() : ol;
1398 }
1399
1400 public long getFetchDuration() {
1401 if(! containsKey(A_FETCH_COMPLETED_TIME)) {
1402 return -1;
1403 }
1404
1405 long completedTime = getLong(A_FETCH_COMPLETED_TIME);
1406 long beganTime = getLong(A_FETCH_BEGAN_TIME);
1407 return completedTime - beganTime;
1408 }
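// Example (illustrative; 'logger' is an assumed logging facility): report
// fetch timing once processing completes.
//   long ms = curi.getFetchDuration();
//   if (ms >= 0) { logger.info(curi + " fetched in " + ms + "ms"); }
//   // -1 signals that no fetch-completed timestamp was recorded.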
1409
1410
1411 }