|
USE:
path = An array of paths to search.
e.g. path = ['/','/index2.html','/index3.html']
You can also use a shorthand notation for numerically consecutive pages.
eg. path = ['/','/index[2..3].html']
filter = An array of filters used to skip logos, directories, etc. Each entry can be either an exact path string or a regular expression. The following will skip any image found in a "/grfx/" or "/banner/" folder or subfolder, case-insensitively:
eg. filter = [/\/grfx\/*/i,/\/banner\/*/i]
min_size_k = The minimum file size (in kilobytes) for an image to be grabbed
# iSlurp.rb
require 'net/http'
# The library file is named in lowercase; requiring 'FileUtils' raises
# LoadError on case-sensitive filesystems.
require 'fileutils'

# Host to connect to (handed to Net::HTTP.start, so no "http://" prefix).
site = 'www.google.com'
# Page paths on that host to scan; entries may use the [a..b] shorthand.
path = ['/']

# Exact path strings and/or Regexps; matching images are skipped.
$filter = []
# Minimum image size in kilobytes (multiplied by 1024 before comparison).
$min_size_k = 1

# Counter used in saved file names (image_<id>.<ext>); it is advanced in
# place with String#succ! after every saved image.
$imgid = '0000'
13
# Prints +text+ on standard output and flushes the stream right away.
#
# Both the write and the flush go through $stdout, so the flush always
# targets the stream that was actually written to, even when $stdout has
# been reassigned.
#
# @param text [Object] the message to print
# @return [IO] the stream that was flushed
def msg(text)
  $stdout.puts text
  $stdout.flush
end
18
# Fetches a single image from +site+ and stores it on disk, provided it is
# not filtered out and the server reports it as a large-enough JPEG.
#
# Side effects on success: creates the folder "<site><path>" (with every
# "/" replaced by "_") if needed, writes image_<$imgid>.<ext> into it,
# appends a "Slurped: ..." line to $log, appends an <img> snippet to
# $html_page, prints the line through msg and advances $imgid.
#
# @param site [String] host name handed to Net::HTTP.start
# @param path [String] path of the page the image was found on
# @param imgpath [String] the image reference as found in the page
# @param imgsize [Integer] minimum content-length in bytes
# @return [nil] when the image is filtered out or fails the HEAD checks;
#   the return value is otherwise not meaningful
def getImage(site, path, imgpath, imgsize)
  imgfolder = (site + path).gsub('/', '_')

  if imgpath.start_with?('/')
    imgfile = imgpath
  else
    base = path
    # A dot in the page path is taken to mean it ends in a file name, so
    # the last segment is dropped to obtain the containing directory.
    base = path.split('/')[0...-1].join('/') if path.include?('.')
    imgfile = "#{base}/#{imgpath}".gsub('//', '/')
  end

  filtered = $filter.any? do |rule|
    rule.is_a?(Regexp) ? !rule.match(imgfile).nil? : imgfile == rule
  end
  return if filtered

  Net::HTTP.start(site) do |http|
    head = http.request_head(imgfile)
    if head.code == '200' &&
       head['content-type'] == 'image/jpeg' &&
       head['content-length'].to_i >= imgsize

      response = http.get(imgfile)
      ext = imgfile.split('.').pop.gsub('/', '')

      Dir.mkdir(imgfolder) unless File.exist?(imgfolder)
      imgstr = "#{imgfolder}/image_#{$imgid}.#{ext}"
      # write, not puts: puts would append a newline to the binary data.
      File.open(imgstr, 'wb') { |f| f.write(response.body) }

      isize = response['content-length'].to_i / 1024.0
      # Only the number is passed through format, so a "%" inside the URL
      # (e.g. "%20") cannot be mistaken for a format directive.
      mess = "Slurped: http://#{site}#{imgfile} (##{$imgid} - #{format('%2.2f', isize)}k)"
      $log << mess
      msg(mess)
      $html_page << "<img src='#{imgstr}'><br>#{mess}"
      $imgid.succ!
    end
  end
end
69
# Downloads the page at +path+ on +site+ and collects the distinct
# .jpg/.jpeg references found in its src attributes.
#
# Each hit is appended to $log as "Found: <src>" and echoed through msg,
# followed by a separator line. Both double- and single-quoted attribute
# values are recognised, case-insensitively. Unlike the earlier loop-based
# version, the result never contains an empty string.
#
# @param site [String] host name handed to Net::HTTP.start
# @param path [String] path of the page to fetch
# @return [Array<String>] unique src values in order of first appearance
def getImageList(site, path)
  msg("---Connecting: #{site}#{path}")

  html = Net::HTTP.start(site) { |http| http.get(path).body }

  src_pattern = /src=(?:"([^"]+\.(?:jpg|jpeg))"|'([^']+\.(?:jpg|jpeg))')/i
  # Exactly one of the two captures is set per match, depending on the quote.
  imgs = html.to_s.scan(src_pattern).map { |double, single| double || single }.uniq

  imgs.each do |img|
    $log << "Found: #{img}"
    msg("Found: #{img}")
  end
  $log << "------------------------"
  msg("------------------------")
  imgs
end
96
# Downloads the page at +path+ on +site+ and returns the distinct
# same-site page links found in its href attributes.
#
# A link is dropped when it
# * contains an http(s)/ftp URL, is a mailto: or javascript: link, or is
#   protocol-relative ("//host/..."),
# * does not start with a character from [a-z0-9_-./&=?] (which also
#   removes empty hrefs and "#fragment" links), or
# * ends in one of the listed non-page extensions (zip, js, css, ...).
#
# Surviving links get a leading "/" when they lack one, and each is
# appended to $log as "Path Found: <path>", followed by a separator line.
#
# @param site [String] host name handed to Net::HTTP.start
# @param path [String] path of the page to fetch
# @return [Array<String>] unique paths, each starting with "/"
def getLinkList(site, path)
  html = Net::HTTP.start(site) { |http| http.get(path).body }

  href_pattern = /href=(?:"([^"]*)"|'([^']*)')/i
  # Exactly one of the two captures is set per match, depending on the quote.
  hrefs = html.to_s.scan(href_pattern).map { |double, single| double || single }.uniq

  off_site = %r{https?://|mailto:|ftp://|javascript:|\A//}i
  plausible = %r{\A[a-z0-9_\-./&=?]+}i
  non_page = /\.(zip|js|ico|xml|css|wml|fla|swf|mov|mpg|mpeg|avi|rm|mp3)\z/i

  paths = hrefs.reject do |href|
    href =~ off_site || href !~ plausible || href =~ non_page
  end

  # Return the link itself when it is already rooted; a bare `if` inside
  # map would yield nil for those entries.
  paths = paths.map { |href| href.start_with?('/') ? href : "/#{href}" }.uniq

  paths.each { |found| $log << "Path Found: #{found}" }
  $log << "------------------------"
  paths
end
133
# Processes one page: logs its links, downloads its qualifying images and
# writes a text log plus an HTML summary into the current directory.
#
# Resets the globals $log, $html_page and $site_imgs, then writes
# "!log_<name>.txt" and "!summary_<name>.html", where <name> is
# "<site><path>" with every "/" replaced by "_".
#
# @param site [String] host name to connect to
# @param path [String] path of the page to process
# @return [nil]
def slurp(site, path)
  imgsize = $min_size_k * 1024 # minimum size in bytes
  $log = []
  $html_page = []
  $site_imgs = []

  # The returned link list is not used here; the call is kept because
  # getLinkList records every path it finds in $log.
  getLinkList(site, path)
  $site_imgs = getImageList(site, path)

  $site_imgs.each do |img|
    getImage(site, path, img, imgsize) unless img.empty?
  end

  safe_path = "#{site}#{path}".gsub('/', '_')

  File.open("!log_#{safe_path}.txt", 'w') do |f|
    f.puts "-----#{Time.now}-----"
    f.puts "---URL: #{site}#{path}---"
    f.print $log.join("\n")
  end

  File.open("!summary_#{safe_path}.html", 'w') do |f|
    f.puts '<html><head></head><body>',
           '<div align="center" style="font-size:10px;font-family:verdana;">'
    f.puts "<h1><a href='http://#{site}#{path}' target='_blank'>#{site}#{path}</a></h1>"
    f.puts $html_page.join('<br><br>')
    f.puts '</div></body></html>'
  end
end
165
# Entry point: process every configured path. An entry containing the
# shorthand "[a..b]" is expanded into one slurp call per integer from a to
# b, with each shorthand occurrence replaced by that integer.
range_spec = /\[([0-9]+)\.\.([0-9]+)\]/

path.each do |entry|
  bounds = range_spec.match(entry)

  if bounds.nil?
    slurp(site, entry)
  else
    first = bounds[1].to_i
    last = bounds[2].to_i
    (first..last).each do |number|
      slurp(site, entry.gsub(range_spec, number.to_s))
    end
  end
end
|