1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.deciderules;
28
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.settings.SimpleType;
34
/**
 * Compares the suffix of a passed CrawlURI, UURI, or String against a regular
 * expression pattern, applying its configured decision to all matches.
 *
 * Several predefined patterns are available for convenience. Choosing
 * 'Custom' makes this behave the same as a regular MatchesRegExpDecideRule.
 *
 * @author Igor Ranitovic
 */
44 public class MatchesFilePatternDecideRule extends MatchesRegExpDecideRule {
45
46 private static final long serialVersionUID = -4182743018517062411L;
47
48 private static final Logger logger =
49 Logger.getLogger(MatchesFilePatternDecideRule.class.getName());
50 public static final String ATTR_USE_PRESET = "use-preset-pattern";
51 public static final String IMAGES_PATTERNS =
52 ".*(?i)(//.(bmp|gif|jpe?g|png|svg|tiff?))$";
53 public static final String AUDIO_PATTERNS =
54 ".*(?i)(//.(aac|aiff?|m3u|m4a|midi?|mp2|mp3|mp4|mpa|ogg|ra|ram|wav|wma))$";
55 public static final String VIDEO_PATTERNS =
56 ".*(?i)(//.(asf|asx|avi|flv|mov|mp4|mpeg|mpg|qt|ram|rm|smil|wmv))$";
57 public static final String MISC_PATTERNS =
58 ".*(?i)(//.(doc|pdf|ppt|swf))$";
59 public static final String ALL_DEFAULT_PATTERNS =
60 ".*(?i)(//.(bmp|gif|jpe?g|png|svg|tiff?|aac|aiff?|m3u|m4a|midi?|mp2" +
61 "|mp3|mp4|mpa|ogg|ra|ram|wav|wma|asf|asx|avi|flv|mov|mp4|mpeg|mpg|qt" +
62 "|ram|rm|smil|wmv|doc|pdf|ppt|swf))$";
63
64 public static final String ALL = "All";
65 public static final String IMAGES = "Images";
66 public static final String AUDIO = "Audio";
67 public static final String VIDEO = "Video";
68 public static final String MISC = "Miscellaneous";
69 public static final String CUSTOM = "Custom";
70
71 /***
72 * Usual constructor.
73 * @param name
74 */
75 public MatchesFilePatternDecideRule(String name) {
76 super(name);
77 setDescription("MatchesFilePatternDecideRule. Applies its decision " + "to all URIs that end with the specified pattern(s). Anything " +
78 " that does not match is let PASS. " +
79 " Default file patterns are: .avi, .bmp, " +
80 ".doc, .gif, .jp(e)g, .mid, .mov, .mp2, .mp3, .mp4, .mpeg, " +
81 ".pdf, .png, .ppt, .ram, .rm,.smil, .swf, .tif(f), .wav, .wmv. " +
82 "It is also possible to specify a custom regular expression, " +
83 "in which case this behaves exactly like the " +
84 " MatchesRegExpDecideRule. See also " +
85 "NotMatchesFilePatternDecideRule.");
86
87 String[] options = new String[] {ALL, IMAGES, AUDIO, VIDEO, MISC,
88 CUSTOM};
89
90 addElementToDefinition(
91 new SimpleType(ATTR_USE_PRESET, "URIs that match selected file " +
92 "patterns will have the decision applied. Default file " +
93 "patterns are:\n" +
94 "Images: .bmp, .gif, .jp(e)g, .png, .tif(f)\n" +
95 "Audio: .mid, mp2, .mp3, .mp4, .wav\n" +
96 "Video: .avi, .mov, .mpeg, .ram, .rm, .smil, .wmv\n" +
97 "Miscellaneous: .doc, .pdf, .ppt, .swf\n" +
98 "All: All above patterns\n" +
99 "Choose 'Custom' to specify your own pattern. Preset " +
100 "patterns are case insensitive.",
101 "All", options));
102
103 addElementToDefinition(
104 new SimpleType(ATTR_REGEXP, "Custom java regular expression. " +
105 "This regular expression will be used instead of the " +
106 "supplied pattern groups for matching. An example " +
107 "of such a regular expression (Miscellaneous): " +
108 ".*(?i)(//.(doc|pdf|ppt|swf))$ " +
109 "Any arbitrary regular expression may be entered and " +
110 "will be applied to the URI.", ""));
111 }
112
113 /***
114 * Use a preset if configured to do so.
115 * @param o Context
116 * @return Regex to use.
117 *
118 * @see org.archive.crawler.filter.URIRegExpFilter#getRegexp(Object)
119 */
120 protected String getRegexp(Object o) {
121 try {
122 String patternType = (String) getAttribute(o, ATTR_USE_PRESET);
123 if (patternType.equals(ALL)) {
124 return ALL_DEFAULT_PATTERNS;
125 } else if (patternType.equals(IMAGES)) {
126 return IMAGES_PATTERNS;
127 } else if (patternType.equals(AUDIO)) {
128 return AUDIO_PATTERNS;
129 } else if (patternType.equals(VIDEO)) {
130 return VIDEO_PATTERNS;
131 } else if (patternType.equals(MISC)) {
132 return MISC_PATTERNS;
133 } else if (patternType.equals(CUSTOM)) {
134 return super.getRegexp(o);
135 } else {
136 assert false : "Unrecognized pattern type " + patternType
137 + ". Should never happen!";
138 }
139 } catch (AttributeNotFoundException e) {
140 logger.severe(e.getMessage());
141 }
142 return null;
143 }
144 }