View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CrawlURI.java
20   * Created on Apr 16, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.datamodel;
25  
26  import java.io.IOException;
27  import java.io.ObjectInputStream;
28  import java.io.ObjectOutputStream;
29  import java.util.ArrayList;
30  import java.util.Collection;
31  import java.util.HashSet;
32  import java.util.Iterator;
33  import java.util.List;
34  import java.util.Set;
35  import java.util.concurrent.CopyOnWriteArrayList;
36  
37  import org.apache.commons.httpclient.HttpStatus;
38  import org.apache.commons.httpclient.URIException;
39  import org.archive.crawler.datamodel.credential.CredentialAvatar;
40  import org.archive.crawler.datamodel.credential.Rfc2617Credential;
41  import org.archive.crawler.extractor.Link;
42  import org.archive.crawler.framework.Processor;
43  import org.archive.crawler.framework.ProcessorChain;
44  import org.archive.crawler.util.Transform;
45  import org.archive.net.UURI;
46  import org.archive.net.UURIFactory;
47  import org.archive.util.Base32;
48  import org.archive.util.HttpRecorder;
49  
50  import st.ata.util.AList;
51  import st.ata.util.HashtableAList;
52  
53  
54  /***
55   * Represents a candidate URI and the associated state it
56   * collects as it is crawled.
57   *
58   * <p>Core state is in instance variables but a flexible
59   * attribute list is also available. Use this 'bucket' to carry
60   * custom processing extracted data and state across CrawlURI
61   * processing.  See the {@link #putString(String, String)},
62   * {@link #getString(String)}, etc. 
63   *
64   * @author Gordon Mohr
65   */
66  public class CrawlURI extends CandidateURI
67  implements FetchStatusCodes {
68  
    private static final long serialVersionUID = 7874096757350100472L;

    /** Sentinel for sizes/counts that have not yet been computed. */
    public static final int UNCALCULATED = -1;
    
    // INHERITED FROM CANDIDATEURI
    // uuri: core identity: the "usable URI" to be crawled
    // isSeed
    // inScopeVersion
    // pathFromSeed
    // via

    // Processing progress (transient fields are per-processing-pass state
    // and are not serialized).
    transient private Processor nextProcessor;
    transient private ProcessorChain nextProcessorChain;
    private int fetchStatus = 0;    // default to unattempted
    private int deferrals = 0;     // count of postponements for prerequisites
    private int fetchAttempts = 0; // the number of fetch attempts that have been made
    transient private int threadNumber; // ToeThread currently responsible for this URI

    // dynamic context
    /** @deprecated */
    private int linkHopCount = UNCALCULATED; // from seeds
    /** @deprecated */
    private int embedHopCount = UNCALCULATED; // from a sure link; reset upon any link traversal

    // User agent to masquerade as when crawling this URI. If null, globals should be used
    private String userAgent = null;

    // Once a link extractor has finished processing this curi this will be
    // set as true
    transient private boolean linkExtractorFinished = false;

    /**
     * Protection against outlink overflow.
     * Change value by setting alternate maximum in heritrix.properties.
     */
    public static final int MAX_OUTLINKS = Integer.
        parseInt(System.getProperty(CrawlURI.class.getName() + ".maxOutLinks",
            "6000"));
    
    // Outlinks dropped after MAX_OUTLINKS was reached; reported via a
    // "dol:" annotation when link extraction finishes.
    transient private int discardedOutlinks = 0; 
110     
////////////////////////////////////////////////////////////////////
    // Both lazily computed; UNCALCULATED (-1) means "not yet set".
    private long contentSize = UNCALCULATED;
    private long contentLength = UNCALCULATED;

    /**
     * Current http recorder.
     *
     * Gets set upon successful request.  Reset at start of processing chain.
     */
    private transient HttpRecorder httpRecorder = null;

    /**
     * Content type of a successfully fetched URI.
     *
     * May be null even on successfully fetched URI.
     */
    private String contentType = null;

    /**
     * True if this CrawlURI has been deemed a prerequisite by the
     * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
     *
     * This flag is used at least inside in the precondition enforcer so that
     * subsequent prerequisite tests know to let this CrawlURI through because
     * its a prerequisite needed by an earlier prerequisite tests (e.g. If
     * this is a robots.txt, then the subsequent login credentials prereq
     * test must not throw it out because its not a login curi).
     */
    private boolean prerequisite = false;

    /**
     * Set to true if this <code>curi</code> is to be POST'd rather than GET-d.
     */
    private boolean post = false;

    /** 
     * Monotonically increasing number within a crawl;
     * useful for tending towards breadth-first ordering.
     * Will sometimes be truncated to 48 bits, so behavior
     * over 281 trillion instantiated CrawlURIs may be 
     * buggy
     */
    protected long ordinal;

    /**
     * Cache of this candidate uuri as a string.
     *
     * Profiling shows us spending about 1-2% of total elapsed time in
     * toString.
     */
    private String cachedCrawlURIString = null;
    
    /**
     * Array to hold keys of alist members that persist across URI processings.
     * Any key mentioned in this list will not be cleared out at the end
     * of a pass down the processing chain.
     */
    private static final List<Object> alistPersistentMember
     = new CopyOnWriteArrayList<Object>(
            new String [] {A_CREDENTIAL_AVATARS_KEY});

    /**
     * A digest (hash, usually SHA1) of retrieved content-body. 
     * 
     */
    private byte[] contentDigest = null;
    // Name of the scheme (e.g. sha1) used to compute contentDigest.
    private String contentDigestScheme = null;
178 
179 
    /**
     * Create a new instance of CrawlURI from a {@link UURI}.
     *
     * @param uuri the UURI to base this CrawlURI on.
     */
    public CrawlURI(UURI uuri) {
        super(uuri);
    }
188 
    /**
     * Create a new instance of CrawlURI from a {@link CandidateURI}.
     *
     * Copies the candidate's identity (UURI, path-from-seed, via and
     * via-context) plus its seed flag, scheduling directive and
     * attribute list.
     *
     * @param caUri the CandidateURI to base this CrawlURI on.
     * @param o Monotonically increasing number within a crawl.
     */
    @SuppressWarnings("deprecation")
    public CrawlURI(CandidateURI caUri, long o) {
        super(caUri.getUURI(), caUri.getPathFromSeed(), caUri.getVia(),
            caUri.getViaContext());
        ordinal = o;
        setIsSeed(caUri.isSeed());
        setSchedulingDirective(caUri.getSchedulingDirective());
        setAList(caUri.getAList());
    }
204 
205     /***
206      * Takes a status code and converts it into a human readable string.
207      *
208      * @param code the status code
209      * @return a human readable string declaring what the status code is.
210      */
211     public static String fetchStatusCodesToString(int code){
212         switch(code){
213             // DNS
214             case S_DNS_SUCCESS : return "DNS-1-OK";
215             // HTTP Informational 1xx
216             case 100  : return "HTTP-100-Info-Continue";
217             case 101  : return "HTTP-101-Info-Switching Protocols";
218             // HTTP Successful 2xx
219             case 200  : return "HTTP-200-Success-OK";
220             case 201  : return "HTTP-201-Success-Created";
221             case 202  : return "HTTP-202-Success-Accepted";
222             case 203  : return "HTTP-203-Success-Non-Authoritative";
223             case 204  : return "HTTP-204-Success-No Content ";
224             case 205  : return "HTTP-205-Success-Reset Content";
225             case 206  : return "HTTP-206-Success-Partial Content";
226             // HTTP Redirection 3xx
227             case 300  : return "HTTP-300-Redirect-Multiple Choices";
228             case 301  : return "HTTP-301-Redirect-Moved Permanently";
229             case 302  : return "HTTP-302-Redirect-Found";
230             case 303  : return "HTTP-303-Redirect-See Other";
231             case 304  : return "HTTP-304-Redirect-Not Modified";
232             case 305  : return "HTTP-305-Redirect-Use Proxy";
233             case 307  : return "HTTP-307-Redirect-Temporary Redirect";
234             // HTTP Client Error 4xx
235             case 400  : return "HTTP-400-ClientErr-Bad Request";
236             case 401  : return "HTTP-401-ClientErr-Unauthorized";
237             case 402  : return "HTTP-402-ClientErr-Payment Required";
238             case 403  : return "HTTP-403-ClientErr-Forbidden";
239             case 404  : return "HTTP-404-ClientErr-Not Found";
240             case 405  : return "HTTP-405-ClientErr-Method Not Allowed";
241             case 407  : return "HTTP-406-ClientErr-Not Acceptable";
242             case 408  : return "HTTP-407-ClientErr-Proxy Authentication Required";
243             case 409  : return "HTTP-408-ClientErr-Request Timeout";
244             case 410  : return "HTTP-409-ClientErr-Conflict";
245             case 406  : return "HTTP-410-ClientErr-Gone";
246             case 411  : return "HTTP-411-ClientErr-Length Required";
247             case 412  : return "HTTP-412-ClientErr-Precondition Failed";
248             case 413  : return "HTTP-413-ClientErr-Request Entity Too Large";
249             case 414  : return "HTTP-414-ClientErr-Request-URI Too Long";
250             case 415  : return "HTTP-415-ClientErr-Unsupported Media Type";
251             case 416  : return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
252             case 417  : return "HTTP-417-ClientErr-Expectation Failed";
253             // HTTP Server Error 5xx
254             case 500  : return "HTTP-500-ServerErr-Internal Server Error";
255             case 501  : return "HTTP-501-ServerErr-Not Implemented";
256             case 502  : return "HTTP-502-ServerErr-Bad Gateway";
257             case 503  : return "HTTP-503-ServerErr-Service Unavailable";
258             case 504  : return "HTTP-504-ServerErr-Gateway Timeout";
259             case 505  : return "HTTP-505-ServerErr-HTTP Version Not Supported";
260             // Heritrix internal codes (all negative numbers
261             case S_BLOCKED_BY_USER:
262                 return "Heritrix(" + S_BLOCKED_BY_USER + ")-Blocked by user";
263             case S_BLOCKED_BY_CUSTOM_PROCESSOR:
264                 return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR +
265                 ")-Blocked by custom prefetch processor";
266             case S_DELETED_BY_USER:
267                 return "Heritrix(" + S_DELETED_BY_USER + ")-Deleted by user";
268             case S_CONNECT_FAILED:
269                 return "Heritrix(" + S_CONNECT_FAILED + ")-Connection failed";
270             case S_CONNECT_LOST:
271                 return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
272             case S_DEEMED_CHAFF:
273                 return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
274             case S_DEFERRED:
275                 return "Heritrix(" + S_DEFERRED + ")-Deferred";
276             case S_DOMAIN_UNRESOLVABLE:
277                 return "Heritrix(" + S_DOMAIN_UNRESOLVABLE
278                         + ")-Domain unresolvable";
279             case S_OUT_OF_SCOPE:
280                 return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
281             case S_DOMAIN_PREREQUISITE_FAILURE:
282                 return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE
283                         + ")-Domain prerequisite failure";
284             case S_ROBOTS_PREREQUISITE_FAILURE:
285                 return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE
286                         + ")-Robots prerequisite failure";
287             case S_OTHER_PREREQUISITE_FAILURE:
288                 return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE
289                         + ")-Other prerequisite failure";
290             case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
291                 return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE
292                         + ")-Prerequisite unschedulable failure";
293             case S_ROBOTS_PRECLUDED:
294                 return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded";
295             case S_RUNTIME_EXCEPTION:
296                 return "Heritrix(" + S_RUNTIME_EXCEPTION
297                         + ")-Runtime exception";
298             case S_SERIOUS_ERROR:
299                 return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
300             case S_TIMEOUT:
301                 return "Heritrix(" + S_TIMEOUT + ")-Timeout";
302             case S_TOO_MANY_EMBED_HOPS:
303                 return "Heritrix(" + S_TOO_MANY_EMBED_HOPS
304                         + ")-Too many embed hops";
305             case S_TOO_MANY_LINK_HOPS:
306                 return "Heritrix(" + S_TOO_MANY_LINK_HOPS
307                         + ")-Too many link hops";
308             case S_TOO_MANY_RETRIES:
309                 return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries";
310             case S_UNATTEMPTED:
311                 return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
312             case S_UNFETCHABLE_URI:
313                 return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI";
314             case S_PROCESSING_THREAD_KILLED:
315                 return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-" +
316                     "Processing thread killed";
317             // Unknown return code
318             default : return Integer.toString(code);
319         }
320     }
321 
322 
323     /***
324      * Return the overall/fetch status of this CrawlURI for its
325      * current trip through the processing loop.
326      *
327      * @return a value from FetchStatusCodes
328      */
329     public int getFetchStatus(){
330         return fetchStatus;
331     }
332 
333     /***
334      * Set the overall/fetch status of this CrawlURI for
335      * its current trip through the processing loop.
336      *
337      * @param newstatus a value from FetchStatusCodes
338      */
339     public void setFetchStatus(int newstatus){
340         fetchStatus = newstatus;
341     }
342 
343     /***
344      * Get the number of attempts at getting the document referenced by this
345      * URI.
346      *
347      * @return the number of attempts at getting the document referenced by this
348      *         URI.
349      */
350     public int getFetchAttempts() {
351         return fetchAttempts;
352     }
353 
354     /***
355      * Increment the number of attempts at getting the document referenced by
356      * this URI.
357      *
358      * @return the number of attempts at getting the document referenced by this
359      *         URI.
360      */
361     public int incrementFetchAttempts() {
362         // TODO: rename, this is actually processing-loop-attempts
363         return fetchAttempts++;
364     }
365 
366     /***
367      * Reset fetchAttempts counter.
368      */
369     public void resetFetchAttempts() {
370         this.fetchAttempts = 0;
371     }
372 
373     /***
374      * Reset deferrals counter.
375      */
376     public void resetDeferrals() {
377         this.deferrals = 0;
378     }
379 
380     /***
381      * Get the next processor to process this URI.
382      *
383      * @return the processor that should process this URI next.
384      */
385     public Processor nextProcessor() {
386         return nextProcessor;
387     }
388 
389     /***
390      * Get the processor chain that should be processing this URI after the
391      * current chain is finished with it.
392      *
393      * @return the next processor chain to process this URI.
394      */
395     public ProcessorChain nextProcessorChain() {
396         return nextProcessorChain;
397     }
398 
399     /***
400      * Set the next processor to process this URI.
401      *
402      * @param processor the next processor to process this URI.
403      */
404     public void setNextProcessor(Processor processor) {
405         nextProcessor = processor;
406     }
407 
408     /***
409      * Set the next processor chain to process this URI.
410      *
411      * @param nextProcessorChain the next processor chain to process this URI.
412      */
413     public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
414         this.nextProcessorChain = nextProcessorChain;
415     }
416 
417     /***
418      * Do all actions associated with setting a <code>CrawlURI</code> as
419      * requiring a prerequisite.
420      *
421      * @param lastProcessorChain Last processor chain reference.  This chain is
422      * where this <code>CrawlURI</code> goes next.
423      * @param preq Object to set a prerequisite.
424      * @throws URIException
425      */
426     public void markPrerequisite(String preq,
427             ProcessorChain lastProcessorChain) throws URIException {
428         Link link = createLink(preq,Link.PREREQ_MISC,Link.PREREQ_HOP);
429         setPrerequisiteUri(link);
430         incrementDeferrals();
431         setFetchStatus(S_DEFERRED);
432         skipToProcessorChain(lastProcessorChain);
433     }
434 
435     /***
436      * Set a prerequisite for this URI.
437      * <p>
438      * A prerequisite is a URI that must be crawled before this URI can be
439      * crawled.
440      *
441      * @param link Link to set as prereq.
442      */
443     public void setPrerequisiteUri(Object link) {
444         putObject(A_PREREQUISITE_URI, link);
445     }
446 
447     /***
448      * Get the prerequisite for this URI.
449      * <p>
450      * A prerequisite is a URI that must be crawled before this URI can be
451      * crawled.
452      *
453      * @return the prerequisite for this URI or null if no prerequisite.
454      */
455     public Object getPrerequisiteUri() {
456         return getObject(A_PREREQUISITE_URI);
457     }
458     
459     /***
460      * @return True if this CrawlURI has a prerequisite.
461      */
462     public boolean hasPrerequisiteUri() {
463         return containsKey(A_PREREQUISITE_URI);
464     }
465 
466     /***
467      * Returns true if this CrawlURI is a prerequisite.
468      *
469      * @return true if this CrawlURI is a prerequisite.
470      */
471     public boolean isPrerequisite() {
472         return this.prerequisite;
473     }
474 
475     /***
476      * Set if this CrawlURI is itself a prerequisite URI.
477      *
478      * @param prerequisite True if this CrawlURI is itself a prerequiste uri.
479      */
480     public void setPrerequisite(boolean prerequisite) {
481         this.prerequisite = prerequisite;
482     }
483 
    /**
     * @return This crawl URI as a string wrapped with 'CrawlURI(' +
     * ')'.
     */
    public String getCrawlURIString() {
        // Lazily build and cache the wrapped form (profiling showed
        // toString() costs ~1-2% of elapsed time; see cachedCrawlURIString).
        // NOTE(review): this is double-checked locking on a non-volatile
        // field; it appears to rely on String immutability for safe
        // publication -- confirm, and do not copy this pattern for
        // mutable types.
        if (this.cachedCrawlURIString == null) {
            synchronized (this) {
                if (this.cachedCrawlURIString == null) {
                    this.cachedCrawlURIString =
                        "CrawlURI(" + toString() + ")";
                }
            }
        }
        return this.cachedCrawlURIString;
    }
499 
500     /***
501      * Get the content type of this URI.
502      *
503      * @return Fetched URIs content type.  May be null.
504      */
505     public String getContentType() {
506         return this.contentType;
507     }
508 
509     /***
510      * Set a fetched uri's content type.
511      *
512      * @param ct Contenttype.  May be null.
513      */
514     public void setContentType(String ct) {
515         this.contentType = ct;
516     }
517 
518     /***
519      * Set the number of the ToeThread responsible for processing this uri.
520      *
521      * @param i the ToeThread number.
522      */
523     public void setThreadNumber(int i) {
524         threadNumber = i;
525     }
526 
527     /***
528      * Get the number of the ToeThread responsible for processing this uri.
529      *
530      * @return the ToeThread number.
531      */
532     public int getThreadNumber() {
533         return threadNumber;
534     }
535 
536     /***
537      * Increment the deferral count.
538      *
539      */
540     public void incrementDeferrals() {
541         deferrals++;
542     }
543 
544     /***
545      * Get the deferral count.
546      *
547      * @return the deferral count.
548      */
549     public int getDeferrals() {
550         return deferrals;
551     }
552 
    /**
     * Remove all attributes set on this uri.
     * <p>
     * This methods removes the attribute list.
     */
    public void stripToMinimal() {
        clearAList();
    }
561 
    /** 
     * Get the size in bytes of this URI's recorded content, inclusive
     * of things like protocol headers. It is the responsibility of the 
     * classes which fetch the URI to set this value accordingly -- it is 
     * not calculated/verified within CrawlURI. 
     * 
     * This value is consulted in reporting/logging/writing-decisions.
     * 
     * @see #setContentSize(long)
     * @return contentSize (UNCALCULATED if never set)
     */
    public long getContentSize(){
        return contentSize;
    }
576 
577     /***
578      * Make note of a non-fatal error, local to a particular Processor,
579      * which should be logged somewhere, but allows processing to continue.
580      *
581      * This is how you add to the local-error log (the 'localized' in
582      * the below is making an error local rather than global, not
583      * making a swiss-french version of the error.).
584      * 
585      * @param processorName Name of processor the exception was thrown
586      * in.
587      * @param ex Throwable to log.
588      * @param message Extra message to log beyond exception message.
589      */
590     public void addLocalizedError(final String processorName,
591             final Throwable ex, final String message) {
592         List<LocalizedError> localizedErrors;
593         if (containsKey(A_LOCALIZED_ERRORS)) {
594             @SuppressWarnings("unchecked")
595             List<LocalizedError> temp // to prevent warning on cast
596              = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS);
597             localizedErrors = temp;
598         } else {
599             localizedErrors = new ArrayList<LocalizedError>();
600             putObject(A_LOCALIZED_ERRORS, localizedErrors);
601         }
602 
603         localizedErrors.add(new LocalizedError(processorName, ex, message));
604         addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" +
605             processorName);
606     }
607     
608     // TODO: Move to utils.
609     protected String getClassSimpleName(final Class c) {
610         String classname = c.getName();
611         int index = classname.lastIndexOf('.');
612         return ((index > 0 && (index + 1) < classname.length())?
613             classname.substring(index + 1): classname);
614     }
615 
616     /***
617      * Add an annotation: an abbrieviated indication of something special
618      * about this URI that need not be present in every crawl.log line,
619      * but should be noted for future reference. 
620      *
621      * @param annotation the annotation to add; should not contain 
622      * whitespace or a comma
623      */
624     public void addAnnotation(String annotation) {
625         String annotations;
626         if(containsKey(A_ANNOTATIONS)) {
627             annotations = getString(A_ANNOTATIONS);
628             annotations += ","+annotation;
629         } else {
630             annotations = annotation;
631         }
632 
633         putString(A_ANNOTATIONS,annotations);
634     }
635     
636     /***
637      * TODO: Implement truncation using booleans rather than as this
638      * ugly String parse.
639      * @return True if fetch was truncated.
640      */
641     public boolean isTruncatedFetch() {
642         return annotationContains(TRUNC_SUFFIX);
643     }
644     
645     public boolean isLengthTruncatedFetch() {
646         return annotationContains(LENGTH_TRUNC);
647     }
648     
649     public boolean isTimeTruncatedFetch() {
650         return annotationContains(TIMER_TRUNC);
651     }
652     
653     public boolean isHeaderTruncatedFetch() {
654         return annotationContains(HEADER_TRUNC);
655     }
656     
657     protected boolean annotationContains(final String str2Find) {
658         boolean result = false;
659         if (!containsKey(A_ANNOTATIONS)) {
660             return result;
661         }
662         String annotations = getString(A_ANNOTATIONS);
663         if (annotations != null && annotations.length() > 0) {
664             result = annotations.indexOf(str2Find) >= 0;
665         }
666         return result;
667     }
668 
669     /***
670      * Get the annotations set for this uri.
671      *
672      * @return the annotations set for this uri.
673      */
674     public String getAnnotations() {
675         return (containsKey(A_ANNOTATIONS))?
676             getString(A_ANNOTATIONS): null;
677     }
678 
    /**
     * Get the embeded hop count.
     *
     * @return the embeded hop count (UNCALCULATED if never set).
     * @deprecated 
     */
    public int getEmbedHopCount() {
        return embedHopCount;
    }

    /**
     * Get the link hop count.
     *
     * @return the link hop count (UNCALCULATED if never set).
     * @deprecated 
     */
    public int getLinkHopCount() {
        return linkHopCount;
    }

    /**
     * Mark this uri as being a seed (zeroes both hop counts).
     *
     *
     * @deprecated 
     */
    public void markAsSeed() {
        linkHopCount = 0;
        embedHopCount = 0;
    }
709 
710     /***
711      * Get the user agent to use for crawling this URI.
712      *
713      * If null the global setting should be used.
714      *
715      * @return user agent or null
716      */
717     public String getUserAgent() {
718         return userAgent;
719     }
720 
721     /***
722      * Set the user agent to use when crawling this URI.
723      *
724      * If not set the global settings should be used.
725      *
726      * @param string user agent to use
727      */
728     public void setUserAgent(String string) {
729         userAgent = string;
730     }
731 
732     /***
733      * Set which processor should be the next processor to process this uri
734      * instead of using the default next processor.
735      *
736      * @param processorChain the processor chain to skip to.
737      * @param processor the processor in the processor chain to skip to.
738      */
739     public void skipToProcessor(ProcessorChain processorChain,
740             Processor processor) {
741         setNextProcessorChain(processorChain);
742         setNextProcessor(processor);
743     }
744 
745     /***
746      * Set which processor chain should be processing this uri next.
747      *
748      * @param processorChain the processor chain to skip to.
749      */
750     public void skipToProcessorChain(ProcessorChain processorChain) {
751         setNextProcessorChain(processorChain);
752         setNextProcessor(null);
753     }
754 
755     /***
756      * For completed HTTP transactions, the length of the content-body.
757      *
758      * @return For completed HTTP transactions, the length of the content-body.
759      */
760     public long getContentLength() {
761         if (this.contentLength < 0) {
762             this.contentLength = (getHttpRecorder() != null)?
763                 getHttpRecorder().getResponseContentLength(): 0;
764         }
765         return this.contentLength;
766     }
767     
768     /***
769      * Get size of data recorded (transferred)
770      *
771      * @return recorded data size
772      */
773     public long getRecordedSize() {
774         return (getHttpRecorder() != null)
775                     ?  getHttpRecorder().getRecordedInput().getSize()
776                     // if unavailable fall back on content-size
777                     : getContentSize(); 
778     }
779 
780     /***
781      * Sets the 'content size' for the URI, which is considered inclusive
782      * of all recorded material (such as protocol headers) or even material
783      * 'virtually' considered (as in material from a previous fetch 
784      * confirmed unchanged with a server). (In contrast, content-length 
785      * matches the HTTP definition, that of the enclosed content-body.)
786      * 
787      * Should be set by a fetcher or other processor as soon as the final 
788      * size of recorded content is known. Setting to an artificial/incorrect
789      * value may affect other reporting/processing. 
790      * 
791      * @param l Content size.
792      */
793     public void setContentSize(long l) {
794         contentSize = l;
795     }
796 
797     /***
798      * If true then a link extractor has already claimed this CrawlURI and
799      * performed link extraction on the document content. This does not
800      * preclude other link extractors that may have an interest in this
801      * CrawlURI from also doing link extraction.
802      * 
803      * <p>There is an onus on link extractors to set this flag if they have
804      * run.
805      * 
806      * @return True if a processor has performed link extraction on this
807      * CrawlURI
808      *
809      * @see #linkExtractorFinished()
810      */
811     public boolean hasBeenLinkExtracted(){
812         return linkExtractorFinished;
813     }
814 
815     /***
816      * Note that link extraction has been performed on this CrawlURI. A processor
817      * doing link extraction should invoke this method once it has finished it's
818      * work. It should invoke it even if no links are extracted. It should only
819      * invoke this method if the link extraction was performed on the document
820      * body (not the HTTP headers etc.).
821      *
822      * @see #hasBeenLinkExtracted()
823      */
824     public void linkExtractorFinished() {
825         linkExtractorFinished = true;
826         if(discardedOutlinks>0) {
827             addAnnotation("dol:"+discardedOutlinks);
828         }
829     }
830 
831     /***
832      * Notify CrawlURI it is about to be logged; opportunity
833      * for self-annotation
834      */
835     public void aboutToLog() {
836         if (fetchAttempts>1) {
837             addAnnotation(fetchAttempts+"t");
838         }
839     }
840 
841     /***
842      * Get the http recorder associated with this uri.
843      *
844      * @return Returns the httpRecorder.  May be null but its set early in
845      * FetchHttp so there is an issue if its null.
846      */
847     public HttpRecorder getHttpRecorder() {
848         return httpRecorder;
849     }
850 
851     /***
852      * Set the http recorder to be associated with this uri.
853      *
854      * @param httpRecorder The httpRecorder to set.
855      */
856     public void setHttpRecorder(HttpRecorder httpRecorder) {
857         this.httpRecorder = httpRecorder;
858     }
859 
860     /***
861      * Return true if this is a http transaction.
862      *
863      * TODO: Compound this and {@link #isPost()} method so that there is one
864      * place to go to find out if get http, post http, ftp, dns.
865      *
866      * @return True if this is a http transaction.
867      */
868     public boolean isHttpTransaction() {
869         return containsKey(A_HTTP_TRANSACTION);
870     }
871 
    /**
     * Clean up after a run through the processing chain.
     *
     * Called at the end of the processing chain by Frontier#finish. Nulls
     * out state gathered during processing so the instance can be retried
     * or retired without dragging per-pass state along.
     */
    public void processingCleanup() {
        // Drop the (potentially heavyweight) recorder reference.
        this.httpRecorder = null;
        // Reset fetch bookkeeping so any reprocessing starts fresh.
        this.fetchStatus = S_UNATTEMPTED;
        this.setPrerequisite(false);
        this.contentSize = UNCALCULATED;
        this.contentLength = UNCALCULATED;
        // Clear 'links extracted' flag.
        this.linkExtractorFinished = false;
        // Clean the alist of all but registered permanent members.
        setAList(getPersistentAList());
    }
889     
    /**
     * Builds a fresh AList containing only those entries of this URI's
     * current AList that should survive processing: keys registered via
     * addAlistPersistentMember(), plus any keys listed under
     * A_HERITABLE_KEYS.
     *
     * @return new AList holding only persistent/heritable entries
     */
    public AList getPersistentAList() {
        AList newAList = new HashtableAList();
        // copy declared persistent keys
        if(alistPersistentMember!=null && alistPersistentMember.size() > 0) {
            newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList());
        }
        // also copy declared 'heritable' keys
        List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
        if(heritableKeys!=null) {
            newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
        }
        return newAList;
    }
903 
904     /***
905      * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>.
906      *
907      * Its safe to pass a CrawlURI instance.  In this case we just return it
908      * as a result. Otherwise, we create new CrawlURI instance.
909      *
910      * @param caUri Candidate URI.
911      * @param ordinal
912      * @return A crawlURI made from the passed CandidateURI.
913      */
914     public static CrawlURI from(CandidateURI caUri, long ordinal) {
915         return (caUri instanceof CrawlURI)?
916             (CrawlURI)caUri: new CrawlURI(caUri, ordinal);
917     }
918 
919     /***
920      * @param avatars Credential avatars to save off.
921      */
922     private void setCredentialAvatars(Set avatars) {
923         putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
924     }
925 
926     /***
927      * @return Credential avatars.  Null if none set.
928      */
929     @SuppressWarnings("unchecked")
930     public Set<CredentialAvatar> getCredentialAvatars() {
931         return (Set)getObject(A_CREDENTIAL_AVATARS_KEY);
932     }
933 
934     /***
935      * @return True if there are avatars attached to this instance.
936      */
937     public boolean hasCredentialAvatars() {
938         return getCredentialAvatars() != null &&
939             getCredentialAvatars().size() > 0;
940     }
941 
942     /***
943      * Add an avatar.
944      *
945      * We do lazy instantiation.
946      *
947      * @param ca Credential avatar to add to set of avatars.
948      */
949     public void addCredentialAvatar(CredentialAvatar ca) {
950         Set<CredentialAvatar> avatars = getCredentialAvatars();
951         if (avatars == null) {
952             avatars = new HashSet<CredentialAvatar>();
953             setCredentialAvatars(avatars);
954         }
955         avatars.add(ca);
956     }
957 
958     /***
959      * Remove all credential avatars from this crawl uri.
960      */
961     public void removeCredentialAvatars() {
962         if (hasCredentialAvatars()) {
963             remove(A_CREDENTIAL_AVATARS_KEY);
964         }
965     }
966 
967     /***
968      * Remove all credential avatars from this crawl uri.
969      * @param ca Avatar to remove.
970      * @return True if we removed passed parameter.  False if no operation
971      * performed.
972      */
973     public boolean removeCredentialAvatar(CredentialAvatar ca) {
974         boolean result = false;
975         Set avatars = getCredentialAvatars();
976         if (avatars != null && avatars.size() > 0) {
977             result = avatars.remove(ca);
978         }
979         return result;
980     }
981 
982     /***
983      * Ask this URI if it was a success or not.
984      *
985      * Only makes sense to call this method after execution of
986      * HttpMethod#execute. Regard any status larger then 0 as success
987      * except for below caveat regarding 401s.  Use {@link #is2XXSuccess()} if
988      * looking for a status code in the 200 range.
989      *
990      * <p>401s caveat: If any rfc2617 credential data present and we got a 401
991      * assume it got loaded in FetchHTTP on expectation that we're to go around
992      * the processing chain again. Report this condition as a failure so we
993      * get another crack at the processing chain only this time we'll be making
994      * use of the loaded credential data.
995      *
996      * @return True if ths URI has been successfully processed.
997      * @see #is2XXSuccess()
998      */
999     public boolean isSuccess() {
1000         boolean result = false;
1001         int statusCode = this.fetchStatus;
1002         if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
1003             hasRfc2617CredentialAvatar()) {
1004             result = false;
1005         } else {
1006             result = (statusCode > 0);
1007         }
1008         return result;
1009     }
1010     
1011     /***
1012      * @return True if status code is in the 2xx range.
1013      * @see #isSuccess()
1014      */
1015     public boolean is2XXSuccess() {
1016     	return this.fetchStatus >= 200 && this.fetchStatus < 300;
1017     }
1018 
1019     /***
1020 	 * @return True if we have an rfc2617 payload.
1021 	 */
1022 	public boolean hasRfc2617CredentialAvatar() {
1023 	    boolean result = false;
1024 	    Set avatars = getCredentialAvatars();
1025 	    if (avatars != null && avatars.size() > 0) {
1026 	        for (Iterator i = avatars.iterator(); i.hasNext();) {
1027 	            if (((CredentialAvatar)i.next()).
1028 	                match(Rfc2617Credential.class)) {
1029 	                result = true;
1030 	                break;
1031 	            }
1032 	        }
1033 	    }
1034         return result;
1035 	}
1036 
1037     /***
1038      * Set whether this URI should be fetched by sending a HTTP POST request.
1039      * Else a HTTP GET request will be used.
1040      *
1041      * @param b Set whether this curi is to be POST'd.  Else its to be GET'd.
1042      */
1043     public void setPost(boolean b) {
1044         this.post = b;
1045     }
1046 
1047     /***
1048      * Returns true if this URI should be fetched by sending a HTTP POST request.
1049      *
1050      *
1051      * TODO: Compound this and {@link #isHttpTransaction()} method so that there
1052      * is one place to go to find out if get http, post http, ftp, dns.
1053      *
1054      * @return Returns is this CrawlURI instance is to be posted.
1055      */
1056     public boolean isPost() {
1057         return this.post;
1058     }
1059 
1060     /***
1061      * Set the retained content-digest value (usu. SHA1). 
1062      * 
1063      * @param digestValue
1064      * @deprecated Use {@link #setContentDigest(String scheme, byte[])}
1065      */
1066     public void setContentDigest(byte[] digestValue) {
1067         setContentDigest("SHA1", digestValue);
1068     }
1069     
    /**
     * Sets the retained content-digest value along with the name of the
     * scheme (e.g. "SHA1") that produced it.
     *
     * @param scheme name of the digest scheme used
     * @param digestValue digest bytes to retain
     */
    public void setContentDigest(final String scheme,
            final byte [] digestValue) {
        this.contentDigest = digestValue;
        this.contentDigestScheme = scheme;
    }
1075     
1076     public String getContentDigestSchemeString() {
1077         if(this.contentDigest==null) {
1078             return null;
1079         }
1080         return this.contentDigestScheme + ":" + getContentDigestString();
1081     }
1082 
1083     /***
1084      * Return the retained content-digest value, if any.
1085      * 
1086      * @return Digest value.
1087      */
1088     public Object getContentDigest() {
1089         return contentDigest;
1090     }
1091     
1092     public String getContentDigestString() {
1093         if(this.contentDigest==null) {
1094             return null;
1095         }
1096         return Base32.encode(this.contentDigest);
1097     }
1098 
    // 'Holder' an enclosing/queueing facility has assigned this CrawlURI,
    // and the key under which it was assigned. Both are opaque to this
    // class and transient (not serialized).
    transient Object holder;
    transient Object holderKey;
1101 
1102     /***
1103      * Remember a 'holder' to which some enclosing/queueing
1104      * facility has assigned this CrawlURI
1105      * .
1106      * @param obj
1107      */
1108     public void setHolder(Object obj) {
1109         holder=obj;
1110     }
1111 
1112     /***
1113      * Return the 'holder' for the convenience of 
1114      * an external facility.
1115      *
1116      * @return holder
1117      */
1118     public Object getHolder() {
1119         return holder;
1120     }
1121 
1122     /***
1123      * Remember a 'holderKey' which some enclosing/queueing
1124      * facility has assigned this CrawlURI
1125      * .
1126      * @param obj
1127      */
1128     public void setHolderKey(Object obj) {
1129         holderKey=obj;
1130     }
1131     /***
1132      * Return the 'holderKey' for convenience of 
1133      * an external facility (Frontier).
1134      * 
1135      * @return holderKey 
1136      */
1137     public Object getHolderKey() {
1138         return holderKey;
1139     }
1140 
1141     /***
1142      * Get the ordinal (serial number) assigned at creation.
1143      * 
1144      * @return ordinal
1145      */
1146     public long getOrdinal() {
1147         return ordinal;
1148     }
1149 
    /** Spot for an integer cost to be placed by an external facility
     * (frontier). Cost is truncated to 8 bits at times, so should not
     * exceed 255. */
    int holderCost = UNCALCULATED;
1153     /***
1154      * Return the 'holderCost' for convenience of external facility (frontier)
1155      * @return value of holderCost
1156      */
1157     public int getHolderCost() {
1158         return holderCost;
1159     }
1160 
1161     /***
1162      * Remember a 'holderCost' which some enclosing/queueing
1163      * facility has assigned this CrawlURI
1164      * @param cost value to remember
1165      */
1166     public void setHolderCost(int cost) {
1167         holderCost = cost;
1168     }
1169 
    /** 
     * All discovered outbound Links (navlinks, embeds, etc.).
     * Can either contain Link instances or CandidateURI instances, or both;
     * the LinksScoper processor converts Link instances in this collection
     * to CandidateURI instances. Transient: custom writeObject/readObject
     * below serialize it explicitly, collapsing empty to null.
     */
    transient Collection<Object> outLinks = new ArrayList<Object>();
1177     
    /**
     * Returns discovered links.  The returned collection might be empty if
     * no links were discovered, or if something like LinksScoper promoted
     * the links to CandidateURIs.
     * 
     * Elements can be removed from the returned collection, but not added.
     * To add a discovered link, use one of the createAndAdd methods or
     * {@link #getOutObjects()}.
     * 
     * NOTE(review): Transform.subclasses presumably yields the Link-typed
     * subset of outLinks — confirm whether it is a live view or a copy
     * before relying on removal semantics.
     * 
     * @return Collection of all discovered outbound Links
     */
    public Collection<Link> getOutLinks() {
        return Transform.subclasses(outLinks, Link.class);
    }
1192     
    /**
     * Returns discovered candidate URIs.  The returned collection will be
     * empty until something like LinksScoper promotes discovered Links
     * into CandidateURIs.
     * 
     * Elements can be removed from the returned collection, but not added.
     * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or
     * {@link #getOutObjects}.
     * 
     * @return  Collection of candidate URIs
     */
    public Collection<CandidateURI> getOutCandidates() {
        return Transform.subclasses(outLinks, CandidateURI.class);
    }
1207     
1208     
1209     /***
1210      * Returns all of the outbound objects.  The returned Collection will
1211      * contain Link instances, or CandidateURI instances, or both.  
1212      * 
1213      * @return  the collection of Links and/or CandidateURIs
1214      */
1215     public Collection<Object> getOutObjects() {
1216         return outLinks;
1217     }
1218     
1219     /***
1220      * Add a discovered Link, unless it would exceed the max number
1221      * to accept. (If so, increment discarded link counter.) 
1222      * 
1223      * @param link the Link to add
1224      */
1225     public void addOutLink(Link link) {
1226         if (outLinks.size() < MAX_OUTLINKS) {
1227             outLinks.add(link);
1228         } else {
1229             // note & discard
1230             discardedOutlinks++;
1231         }
1232     }
1233     
1234     public void clearOutlinks() {
1235         this.outLinks.clear();
1236     }
1237     
    /**
     * Replace current collection of links w/ passed list.
     * Used by Scopers adjusting the list of links (removing those
     * not in scope and promoting Links to CandidateURIs).
     * 
     * @param links collection of CandidateURIs replacing any previously
     *   existing outLinks or outCandidates
     */
    public void replaceOutlinks(Collection<CandidateURI> links) {
        clearOutlinks();
        this.outLinks.addAll(links);
    }
1250     
1251     
1252     /***
1253      * @return Count of outlinks.
1254      */
1255     public int outlinksSize() {
1256         return this.outLinks.size();
1257     }
1258 
1259     /***
1260      * Convenience method for creating a Link discovered at this URI
1261      * with the given string and context
1262      * 
1263      * @param url
1264      *            String to use to create Link
1265      * @param context
1266      *            CharSequence context to use
1267      * @param hopType
1268      * @return Link.
1269      * @throws URIException
1270      *             if Link UURI cannot be constructed
1271      */
1272     public Link createLink(String url, CharSequence context,
1273             char hopType) throws URIException {
1274         return new Link(getUURI(), UURIFactory.getInstance(getUURI(),
1275                 url), context, hopType);
1276     }
1277     
1278     /***
1279      * Convenience method for creating a Link with the given string and
1280      * context
1281      * 
1282      * @param url
1283      *            String to use to create Link
1284      * @param context
1285      *            CharSequence context to use
1286      * @param hopType
1287      * @throws URIException
1288      *             if Link UURI cannot be constructed
1289      */
1290     public void createAndAddLink(String url, CharSequence context,
1291             char hopType) throws URIException {
1292         addOutLink(createLink(url, context, hopType));
1293     }
1294 
1295     /***
1296      * Convenience method for creating a Link with the given string and
1297      * context, relative to a previously set base HREF if available (or
1298      * relative to the current CrawlURI if no other base has been set)
1299      * 
1300      * @param url String URL to add as destination of link
1301      * @param context String context where link was discovered
1302      * @param hopType char hop-type indicator
1303      * @throws URIException
1304      */
1305     public void createAndAddLinkRelativeToBase(String url,
1306             CharSequence context, char hopType) throws URIException {
1307         addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1308                 getBaseURI(), url), context, hopType));
1309     }
1310     
1311     /***
1312      * Convenience method for creating a Link with the given string and
1313      * context, relative to this CrawlURI's via UURI if available. (If
1314      * a via is not available, falls back to using 
1315      * #createAndAddLinkRelativeToBase.)
1316      * 
1317      * @param url String URL to add as destination of link
1318      * @param context String context where link was discovered
1319      * @param hopType char hop-type indicator
1320      * @throws URIException
1321      */
1322     public void createAndAddLinkRelativeToVia(String url,
1323             CharSequence context, char hopType) throws URIException {
1324         if(getVia()!=null) {
1325             addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1326                 getVia(), url), context, hopType));
1327         } else {
1328             // if no 'via', fall back to base/self
1329             createAndAddLinkRelativeToBase(url,context,hopType);
1330         }
1331     }
1332     
1333     /***
1334      * Set the (HTML) Base URI used for derelativizing internal URIs. 
1335      * 
1336      * @param baseHref String base href to use
1337      * @throws URIException if supplied string cannot be interpreted as URI
1338      */
1339     public void setBaseURI(String baseHref) throws URIException {
1340         putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));
1341     }
1342       
1343     /***
1344      * Get the (HTML) Base URI used for derelativizing internal URIs. 
1345      *
1346      * @return UURI base URI previously set 
1347      */  
1348     public UURI getBaseURI() {
1349         if (!containsKey(A_HTML_BASE)) {
1350             return getUURI();
1351         }
1352         return (UURI)getObject(A_HTML_BASE);
1353     }
1354     
1355     /***
1356      * Add the key of alist items you want to persist across
1357      * processings.
1358      * @param key Key to add.
1359      */
1360     public static void addAlistPersistentMember(Object key) {
1361         alistPersistentMember.add(key);
1362     }
1363     
1364     /***
1365      * @param key Key to remove.
1366      * @return True if list contained the element.
1367      */
1368     public static boolean removeAlistPersistentMember(Object key) {
1369         return alistPersistentMember.remove(key);
1370     }
1371 
    /**
     * Custom serialization writing an empty 'outLinks' as null. Estimated
     * to save ~20 bytes in serialized form.
     *
     * @param stream serialization output stream
     * @throws IOException on write failure
     */
    private void writeObject(ObjectOutputStream stream) throws IOException {
        stream.defaultWriteObject();
        // outLinks is transient, so it must be written explicitly; empty is
        // collapsed to null and restored by readObject.
        stream.writeObject((outLinks.isEmpty()) ? null : outLinks);
    }
1383 
    /**
     * Custom deserialization recreating an empty ArrayList from null in the
     * 'outLinks' slot. (Counterpart of writeObject, which stores an empty
     * collection as null; previous javadoc incorrectly said HashSet.)
     *
     * @param stream serialization input stream
     * @throws IOException on read failure
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    private void readObject(ObjectInputStream stream) throws IOException,
            ClassNotFoundException {
        stream.defaultReadObject();
        @SuppressWarnings("unchecked")
        Collection<Object> ol = (Collection<Object>) stream.readObject();
        outLinks = (ol == null) ? new ArrayList<Object>() : ol;
    }
1399 
1400     public long getFetchDuration() {
1401         if(! containsKey(A_FETCH_COMPLETED_TIME)) {
1402             return -1;
1403         }
1404         
1405         long completedTime = getLong(A_FETCH_COMPLETED_TIME);
1406         long beganTime = getLong(A_FETCH_BEGAN_TIME);
1407         return completedTime - beganTime;
1408     }
1409 
1410 
1411 }