View Javadoc

1   /* HtmlFormCredential
2    *
3    * Created on Apr 7, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.datamodel.credential;
24  
25  import java.util.HashMap;
26  import java.util.Iterator;
27  import java.util.Map;
28  import java.util.logging.Logger;
29  
30  import javax.management.Attribute;
31  import javax.management.AttributeNotFoundException;
32  
33  import org.apache.commons.httpclient.HttpClient;
34  import org.apache.commons.httpclient.HttpMethod;
35  import org.apache.commons.httpclient.HttpMethodBase;
36  import org.apache.commons.httpclient.NameValuePair;
37  import org.apache.commons.httpclient.URIException;
38  import org.apache.commons.httpclient.methods.GetMethod;
39  import org.apache.commons.httpclient.methods.PostMethod;
40  import org.apache.commons.lang.StringUtils;
41  import org.archive.crawler.datamodel.CrawlURI;
42  import org.archive.crawler.settings.MapType;
43  import org.archive.crawler.settings.SimpleType;
44  import org.archive.crawler.settings.Type;
45  import org.archive.net.UURI;
46  import org.archive.net.UURIFactory;
47  
48  
49  
50  /***
51   * Credential that holds all needed to do a GET/POST to a HTML form.
52   *
53   * @author stack
54   * @version $Revision: 5913 $, $Date: 2008-07-28 22:34:52 +0000 (Mon, 28 Jul 2008) $
55   */
56  public class HtmlFormCredential extends Credential {
57  
58      private static final long serialVersionUID = -4732570804435453949L;
59  
60      private static final Logger logger =
61          Logger.getLogger(HtmlFormCredential.class.getName());
62  
63      private static final String ATTR_LOGIN_URI = "login-uri";
64      private static final String ATTR_FORM_ITEMS = "form-items";
65      private static final String ATTR_FORM_METHOD = "http-method";
66      private static final String [] METHODS = {"POST", "GET"};
67  
68      /***
69       * Constructor.
70       *
71       * A constructor that takes name of the credential is required by settings
72       * framework.
73       *
74       * @param name Name of this credential.
75       */
76      public HtmlFormCredential(final String name)
77      {
78          super(name, "Credential that has all necessary" +
79              " for running a POST/GET to an HTML login form.");
80  
81          Type t = addElementToDefinition(new SimpleType("login-uri",
82              "Full URI of page that contains the HTML login form we're to" +
83              " apply these credentials too: E.g. http://www.archive.org", ""));
84          t.setOverrideable(false);
85          t.setExpertSetting(true);
86  
87  
88          t = addElementToDefinition(new SimpleType(ATTR_FORM_METHOD,
89              "GET or POST", METHODS[0], METHODS));
90          t.setOverrideable(false);
91          t.setExpertSetting(true);
92  
93          t = addElementToDefinition(new MapType(ATTR_FORM_ITEMS, "Form items.",
94              String.class));
95          t.setOverrideable(false);
96          t.setExpertSetting(true);
97      }
98  
99      /***
100      * @param context CrawlURI context to use.
101      * @return login-uri.
102      * @throws AttributeNotFoundException
103      */
104     public String getLoginUri(final CrawlURI context)
105             throws AttributeNotFoundException {
106         return (String)getAttribute(ATTR_LOGIN_URI, context);
107     }
108 
109     /***
110      * @param context CrawlURI context to use.
111      * @return login-uri.
112      * @throws AttributeNotFoundException
113      */
114     public String getHttpMethod(final CrawlURI context)
115             throws AttributeNotFoundException {
116         return (String)getAttribute(ATTR_FORM_METHOD, context);
117     }
118 
119     /***
120      * @param context CrawlURI context to use.
121      * @return Form inputs as convenient map.  Returns null if no form items.
122      * @throws AttributeNotFoundException
123      */
124     public Map<String,Object> getFormItems(final CrawlURI context)
125             throws AttributeNotFoundException {
126         Map<String,Object> result = null;
127         MapType items = (MapType)getAttribute(ATTR_FORM_ITEMS, context);
128         if (items != null) {
129             for (Iterator i = items.iterator(context); i.hasNext();) {
130                 Attribute a = (Attribute)i.next();
131                 if (result == null) {
132                     result = new HashMap<String,Object>();
133                 }
134                 result.put(a.getName(), a.getValue());
135             }
136         }
137         return result;
138     }
139 
140     public boolean isPrerequisite(final CrawlURI curi) {
141         boolean result = false;
142         String curiStr = curi.getUURI().toString();
143         String loginUri = getPrerequisite(curi);
144         if (loginUri != null) {
145             try {
146                 UURI uuri = UURIFactory.getInstance(curi.getUURI(), loginUri);
147                 if (uuri != null && curiStr != null &&
148                     uuri.toString().equals(curiStr)) {
149                     result = true;
150                     if (!curi.isPrerequisite()) {
151                         curi.setPrerequisite(true);
152                         logger.fine(curi + " is prereq.");
153                     }
154                 }
155             } catch (URIException e) {
156                 logger.severe("Failed to uuri: " + curi + ", " +
157                     e.getMessage());
158             }
159         }
160         return result;
161     }
162 
163     public boolean hasPrerequisite(CrawlURI curi) {
164         return getPrerequisite(curi) != null;
165     }
166 
167     public String getPrerequisite(CrawlURI curi) {
168         String loginUri = null;
169         try {
170             loginUri = getLoginUri(curi);
171         } catch (AttributeNotFoundException e) {
172             logger.severe("Failed to getLoginUri: " + this + ", " + curi + ","
173                 + e.getMessage());
174             // Not much I can do here. What if I fail every time? Then
175             // this prereq. will not ever be processed.  We'll never get on to
176             // this server.
177         }
178         return loginUri;
179     }
180 
181     public String getKey(CrawlURI curi) throws AttributeNotFoundException {
182         return getLoginUri(curi);
183     }
184 
185     public boolean isEveryTime() {
186         // This authentication is one time only.
187         return false;
188     }
189 
190     public boolean populate(CrawlURI curi, HttpClient http, HttpMethod method,
191             String payload) {
192         // http is not used.
193         // payload is not used.
194         boolean result = false;
195         Map formItems = null;
196         try {
197             formItems = getFormItems(curi);
198         }
199         catch (AttributeNotFoundException e1) {
200             logger.severe("Failed get of form items for " + curi);
201         }
202         if (formItems == null || formItems.size() <= 0) {
203             try {
204                 logger.severe("No form items for " + method.getURI());
205             }
206             catch (URIException e) {
207                 logger.severe("No form items and exception getting uri: " +
208                     e.getMessage());
209             }
210             return result;
211         }
212 
213         NameValuePair[] data = new NameValuePair[formItems.size()];
214         int index = 0;
215         String key = null;
216         for (Iterator i = formItems.keySet().iterator(); i.hasNext();) {
217             key = (String)i.next();
218             data[index++] = new NameValuePair(key, (String)formItems.get(key));
219         }
220         if (method instanceof PostMethod) {
221             ((PostMethod)method).setRequestBody(data);
222             result = true;
223         } else if (method instanceof GetMethod) {
224             // Append these values to the query string.
225             // Get current query string, then add data, then get it again
226             // only this time its our data only... then append.
227             HttpMethodBase hmb = (HttpMethodBase)method;
228             String currentQuery = hmb.getQueryString();
229             hmb.setQueryString(data);
230             String newQuery = hmb.getQueryString();
231             hmb.setQueryString(
232                     ((StringUtils.isNotEmpty(currentQuery))
233                             ? currentQuery + "&"
234                             : "")
235             		+ newQuery);
236             result = true;
237         } else {
238             logger.severe("Unknown method type: " + method);
239         }
240         return result;
241     }
242 
243     public boolean isPost(CrawlURI curi) {
244         String method = null;
245         try {
246             method = getHttpMethod(curi);
247         }
248         catch (AttributeNotFoundException e) {
249             logger.severe("Failed to get method for " + curi + ", " + this);
250         }
251         return method != null && method.equalsIgnoreCase("POST");
252     }
253 }