-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwayback_proxy_server.rb
371 lines (316 loc) · 10.2 KB
/
wayback_proxy_server.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
# encoding: UTF-8
class WaybackProxyServer
# Carriage return, line feed, and the two combined, as one-/two-byte strings.
CR = "\r"
LF = "\n"
CRLF = "\r\n"
# TODO: ...
# - SSL support
# - POST requests
# - PUT requests
# - HEAD requests
# - DELETE requests
# Sets up the proxy from a trailing options hash.
#
# Options read here: :cache (object used by #cache) and :logger. The remaining
# options (:host, :port, ...) are looked up lazily through @opts.
def initialize(*args)
  options = args.extract_options!
  @opts = options
  @cache = options[:cache]
  @threads = []
  @logger = options[:logger]
  return unless DEBUG

  puts "Starting Wayback Server on #{host}:#{port}..."
end
# Returns the listening TCPServer, creating it on first use.
#
# The SSL wrapping is currently disabled: #ssl_configured? guards a block that
# only contains the commented-out OpenSSL setup below.
def server
  if @server.nil?
    @server = TCPServer.new(host, port)
    begin
      if ssl_configured?
        # context = OpenSSL::SSL::SSLContext.new
        # context.verify_mode = OpenSSL::SSL::VERIFY_NONE
        # context.cert = OpenSSL::X509::Certificate.new( File.read(@opts[:ssl][:cert]) )
        # context.key = OpenSSL::PKey::RSA.new( File.read(@opts[:ssl][:key]) )
        # @sslserver = OpenSSL::SSL::SSLServer.new(@server, context)
        # # @sslserver.start_immediately = true
        # @server = @sslserver
        # puts "...using OpenSSL" if DEBUG
      end
    rescue => err
      puts "OpenSSL Error: #{err}"
    end
  end
  @server
end
# Whether SSL should be used. Hard-wired to false for now; the intended check
# is kept below for reference.
def ssl_configured?
  # File.exists?(@opts[:ssl][:cert]) and File.exists?(@opts[:ssl][:key])
  false
end
# def log(s,m)
# return if @logger.blank?
# @logger.
# end
# Entries of APP_ROOT/whitelist.txt, one per line, with line endings removed
# and blank lines skipped. Memoized; any error while reading yields [].
def whitelist
  @whitelist ||= begin
    raw_lines = File.readlines(File.join(APP_ROOT, 'whitelist.txt'))
    kept = raw_lines.reject { |line| line.match(/^(\r)?\n$/) }
    kept.map { |line| line.gsub(/(\r)?\n/m, '') }
  rescue
    []
  end
end
# Wrapper for cache, if configured.
#
# With no @cache the block is simply run and its value returned. Otherwise the
# value stored under the SHA1 of +key+ is returned when present (and truthy);
# on a miss the block is run once and its result is stored and returned.
#
# Cache trouble never causes the block to run twice: a failing lookup counts
# as a miss, a failing digest or store is reported as a "Caching Error" and the
# freshly computed value is still returned. Exceptions raised by the block
# itself propagate to the caller unchanged.
def cache(key)
  return yield unless @cache

  sha_key =
    begin
      Digest::SHA1.hexdigest(key)
    rescue => err
      puts "Caching Error: #{key} :: #{err}"
      nil
    end
  # Without a usable key there is nothing to look up or store.
  return yield if sha_key.nil?

  cached =
    begin
      @cache.get(sha_key)
    rescue
      nil
    end
  return cached if cached

  result = yield
  begin
    @cache.set(sha_key, result)
  rescue => err
    puts "Caching Error: #{key} :: #{err}"
  end
  result
end
# Core server component: accepts connections forever and serves each one on
# its own thread.
#
# Per-connection state lives in thread-locals: :session, :request_ip,
# :redirect_count, :request (the header lines read so far, concatenated
# without separators) and :request_method. The request target is parsed into
# an Addressable::URI and handed to #fetch; whatever #fetch returns is written
# back to the client. The session is always closed when the thread finishes.
def run
  loop do
    Thread.start(server.accept) do |session|
      begin
        Thread.current[:session] = session
        # NOTE(review): 'nnC4' decodes an IPv4 sockaddr; an IPv6 peer would
        # produce a meaningless address here - confirm whether that matters.
        _family, _peer_port, *addr = session.getpeername.unpack('nnC4')
        Thread.current[:request_ip] = addr.join('.')
        Thread.current[:redirect_count] = 0

        uri = nil
        begin
          request = ''
          # Read header lines up to the first blank line.
          while (line = session.gets)
            break if line =~ /^\s*$/
            request << line.chomp
          end
          Thread.current[:request] = request
          # Get the method and URL from the request string
          http_request = request.lines.first # first line
          Thread.current[:request_method] = http_request.gsub(/^([A-Z]+)(.*)$/i, '\1').downcase.to_sym rescue nil
          uri = Addressable::URI.parse(http_request.gsub(/^([A-Z]+)/, '').gsub(/(\sHTTP.*)/, '')) rescue nil
        rescue => err
          handle_error(__method__, err)
          # Best effort: the client may already be gone.
          session.write(http_error(:failure)) rescue nil
          next
        end

        # Number of retries already spent on a broken pipe / reset connection.
        attempts = 0
        begin
          session.write(fetch(uri))
        rescue Errno::EPIPE, Errno::ECONNRESET => err
          if attempts < max_retries
            attempts += 1
            retry
          end
          handle_error(__method__, err)
          session.write(http_error(:failure)) rescue nil
        rescue => err
          handle_error(__method__, err)
          session.write(http_error(:failure)) rescue nil
        end
      ensure
        session.close rescue nil
      end
    end
  end
end
# Fetches +uri+ using the handler that matches the current thread's
# :request_method and returns the raw HTTP response text.
#
# Returns the :bad_request error response for a nil URI, :too_many_redirects
# once the thread's :redirect_count exceeds #max_redirects (a missing counter
# counts as zero), and :not_implemented for unknown methods. If a handler
# raises, the error is logged through #handle_error and the :failure response
# is returned, so the caller always has something to send to the client.
def fetch(uri)
  return http_error(:bad_request) if uri.nil?
  return http_error(:too_many_redirects) if Thread.current[:redirect_count].to_i > max_redirects
  puts "Fetch #{Thread.current[:request_ip]}: #{Thread.current[:request_method]}: #{uri}" if DEBUG

  begin
    case Thread.current[:request_method]
    when :get     then get_request(uri)
    when :post    then post_request(uri)
    when :put     then put_request(uri)
    when :head    then head_request(uri)
    when :delete  then delete_request(uri)
    when :connect then connect_request(uri)
    when :options then options_request(uri)
    when :trace   then trace_request(uri)
    else http_error(:not_implemented)
    end
  rescue => err
    handle_error(__method__, err)
    http_error(:failure)
  end
end
# Handle GET requests.
#
# Fills in a missing scheme ('http') and host ('web.archive.org'), swaps the
# URI for its Wayback equivalent when the host is not archive.org and the path
# does not look like a static asset, then performs the request and returns the
# text built by #parse_response. Any failure while requesting is logged and
# answered with the :failure error response.
def get_request(uri)
  # Get Wayback URI if URI seems like it would be an item that is archived.
  begin
    uri.scheme = 'http' unless uri.scheme
    uri.host = 'web.archive.org' unless uri.host
    if uri.host !~ /archive\.org$/i && determine_page_type(uri) == :unknown
      uri = get_wayback_uri(uri, :first)
    end
  rescue => err
    # Keep going with the URI as it stands; the lookup is only an optimisation.
    handle_error(__method__, err)
  end

  # GET request
  begin
    cache("get:#{uri}") do
      # request_uri keeps the query string and turns an empty path into "/";
      # the bare path would drop "?a=b" and is rejected by Net::HTTP when empty.
      req = Net::HTTP::Get.new(uri.request_uri, default_opts)
      resp = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }
      parse_response(resp)
    end
  rescue => err
    handle_error(__method__, err)
    http_error(:failure)
  end
end
# Handle POST requests (not implemented): returns the :not_implemented error
# response.
#
# +uri+ is accepted because #fetch passes the request URI to every handler; it
# is optional so zero-argument calls keep working.
def post_request(uri = nil)
  puts "POST not implemented: #{uri}" if DEBUG
  http_error(:not_implemented)
end
# Handle HEAD requests (not implemented): returns the :not_implemented error
# response.
#
# +uri+ is accepted because #fetch passes the request URI to every handler; it
# is optional so zero-argument calls keep working.
def head_request(uri = nil)
  puts "HEAD not implemented: #{uri}" if DEBUG
  http_error(:not_implemented)
end
# Handle PUT requests (not implemented): returns the :not_implemented error
# response.
#
# +uri+ is accepted because #fetch passes the request URI to every handler; it
# is optional so zero-argument calls keep working.
def put_request(uri = nil)
  puts "PUT not implemented: #{uri}" if DEBUG
  http_error(:not_implemented)
end
# Handle DELETE requests (not implemented): returns the :not_implemented error
# response.
#
# +uri+ is accepted because #fetch passes the request URI to every handler; it
# is optional so zero-argument calls keep working.
def delete_request(uri = nil)
  puts "DELETE not implemented: #{uri}" if DEBUG
  http_error(:not_implemented)
end
# Handle CONNECT requests by relaying raw bytes between the client session and
# the requested "host:port". Only enabled when @opts[:allow_ssl] is set;
# otherwise the :failure error response is returned.
#
# When the upstream connection cannot be opened, the :bad_gateway response is
# written to the client and no relaying is attempted. In every tunnelling case
# the method returns nil, because the status line has already been written to
# the session directly.
def connect_request(uri)
  return http_error(:failure) unless @opts[:allow_ssl]

  session = Thread.current[:session]
  os = nil
  begin
    reqhost, reqport = uri.to_s.split(":", 2)
    begin
      os = TCPSocket.new(reqhost, reqport)
    rescue => err
      puts "CONNECT #{reqhost}:#{reqport}: failed `#{err.message}'"
      session.write(http_error(:bad_gateway))
      # Without an upstream socket there is nothing to relay.
      return nil
    end
    session.write(http_success)
    session.write("\r\n") # Blank line ends the response headers

    begin
      # NOTE(review): the 5 second Timeout bounds the whole tunnel while the
      # select timeout is 5000 seconds - these look swapped or mis-scaled;
      # confirm the intended limits before changing them.
      Timeout::timeout(5) {
        while (fds = IO::select([session, os], nil, nil, 5000))
          if fds[0].member?(session)
            os.syswrite(session.sysread(1024))
          elsif fds[0].member?(os)
            session.syswrite(os.sysread(1024))
          end
        end
      }
    rescue Timeout::Error, EOFError
      # Time budget used up, or one side closed its end: the tunnel is done.
      nil
    end
    nil
  rescue => err
    handle_error(__method__, err)
    nil
  ensure
    os.close if os && !os.closed?
  end
end
# OPTIONS is not supported: optionally logs the URI, then returns the
# :not_implemented error response.
def options_request(uri)
  if DEBUG
    puts "OPTIONS not implemented: #{uri}"
  end
  http_error(:not_implemented)
end
# TRACE is not supported: optionally logs the URI, then returns the
# :not_implemented error response.
def trace_request(uri)
  if DEBUG
    puts "TRACE not implemented: #{uri}"
  end
  http_error(:not_implemented)
end
# Parse the Net::HTTP response into the raw text sent back to the client.
#
# * Success: a 200 status line, the upstream headers (minus hop-by-hop,
#   length, x-* and memento* headers), a blank line and the body. Root-relative
#   "/web/<digits>" references following an "=" are made absolute against
#   http://web.archive.org and the Wayback toolbar markup is removed. A missing
#   body (e.g. 204 No Content) is treated as empty.
# * Redirect: bumps the thread's :redirect_count and fetches the Location.
# * Anything else: the :failure error response.
#
# +i+ is unused; it is kept so existing callers that pass it still work.
def parse_response(resp, i = 0)
  case resp
  when Net::HTTPSuccess
    content = http_success
    # Get Headers
    resp.each_header do |h, v|
      next if ['transfer-encoding', 'connection', 'link', 'server', 'content-length'].include?(h.downcase)
      next if h.match(/^x\-/i) || h.match(/^memento/)
      content << h + ": " + v + "\r\n"
    end
    content << "\r\n"
    # body is nil for responses without content; appending nil would raise.
    content << resp.body.to_s
    content << "\r\n"
    content.gsub!(/(\=(\s+)?(\"|\')?)(\\\/web\\\/\d+)/m, "\\1http:\\/\\/web.archive.org\\4")
    content.gsub!(/(\=(\s+)?(\"|\')?)(\/web\/\d+)/m, "\\1http://web.archive.org\\4")
    content.gsub!(/(\<\!\-\- BEGIN WAYBACK TOOLBAR INSERT \-\-\>)(.*)(\<\!\-\- END WAYBACK TOOLBAR INSERT \-\-\>)/im, '')
    content
  when Net::HTTPRedirection
    Thread.current[:redirect_count] += 1
    new_uri = Addressable::URI.parse(resp['location'])
    fetch(new_uri)
  else
    http_error(:failure)
  end
end
# Classifies a URI by the file extension at the end of its path.
#
# Returns :image, :document, :media or :file for the recognised extensions
# (checked in that order, case-insensitively) and :unknown otherwise.
# NOTE(review): "acc" in the media list is possibly a typo for "aac" - confirm.
def determine_page_type(uri)
  if uri.path.match(/\.(png|gif|jpg|jpeg|bmp|svg|ico)$/i)
    :image
  elsif uri.path.match(/\.(doc|docx|xls|xlsx|csv|txt|pdf|md)$/i)
    :document
  elsif uri.path.match(/\.(mp4|mp3|avi|wma|wmv|acc|m4a|ogg|mov|flv|mpg|mpeg)$/i)
    :media
  elsif uri.path.match(/\.(css|js|xml|rss|rdf|json)$/i)
    :file
  else
    :unknown
  end
end
# Resolves +uri+ to its Wayback snapshot URI when one is available.
#
# Whitelisted URIs are returned untouched. Otherwise Wayback.available is asked
# for the snapshot selected by +t+; its :url is parsed and returned when the
# reply has both :url and :available set. In every other case, including any
# error, the original +uri+ is returned. The result goes through #cache.
def get_wayback_uri(uri,t=:first)
  cache("wayback:#{uri}:#{t}") do
    begin
      snapshot = whitelisted?(uri) ? nil : Wayback.available(uri, t)
      if snapshot && snapshot[:url] && snapshot[:available]
        Addressable::URI.parse(snapshot[:url])
      else
        uri
      end
    rescue
      uri
    end
  end
end
# Echo out error information and, when DEBUG is set, the backtrace.
#
# +m+ is the name of the method reporting the error ('unknown' when nil) and
# +err+ the exception. An exception that was never raised has no backtrace;
# that case prints just the summary line. Always returns nil.
def handle_error(m, err)
  puts "Error: #{err} in #{m || 'unknown'}" # if DEBUG
  Array(err.backtrace).each { |l| puts " #{l}" } if DEBUG
  nil
end
# True when any whitelist entry occurs in the URI's string form, compared
# literally (entries are regexp-escaped) and without regard to case.
def whitelisted?(uri)
  whitelist.any? do |entry|
    uri.to_s.match(Regexp.new(Regexp.escape(entry), Regexp::IGNORECASE))
  end
end
# Default options

# Headers used for outgoing requests.
def default_opts
  { 'User-Agent' => WAYBACK_PROXY_USER_AGENT }
end

# Host to listen on; falls back to 'localhost'.
def host
  configured = @opts[:host]
  configured || 'localhost'
end

# Port to listen on; falls back to 8888.
def port
  configured = @opts[:port]
  configured || 8888
end

# Redirect limit; falls back to 5 when the constant is nil or false.
def max_redirects
  limit = WAYBACK_PROXY_MAX_REDIRECTS
  limit || 5
end

# Retry limit; falls back to 5 when the constant is nil or false.
def max_retries
  limit = WAYBACK_PROXY_MAX_RETRIES
  limit || 5
end
# HTTP status messages
#
# Each helper returns a brand-new status-line string on every call; callers
# append headers and bodies to it in place.
# NOTE(review): "503 Bad Gateway" pairs the 503 code with the 502 reason
# phrase, and too_many_redirects answers 504 - confirm the intended codes
# before changing them, since #http_error derives the page file from the code.
def http_success
  "HTTP/1.1 200 OK\r\n"
end

def http_bad_request
  "HTTP/1.1 400 Bad Request\r\n"
end

def http_failure
  "HTTP/1.1 404 Not Found\r\n"
end

def http_not_implemented
  "HTTP/1.1 501 Not Implemented\r\n"
end

def http_bad_gateway
  "HTTP/1.1 503 Bad Gateway\r\n"
end

def http_too_many_redirects
  "HTTP/1.1 504 Gateway Timeout\r\n"
end
# Builds a complete error response for the status named by +s+ (for example
# :failure or :not_implemented, resolved through the matching http_<s> helper).
#
# The response is the status line, a blank line, and the contents of
# APP_ROOT/pages/<code>.html, where <code> is the three-digit status code from
# the status line. When that page cannot be read the body falls back to ':('.
def http_error(s)
  str = send("http_#{s}")
  code = str[/\d{3}/]
  body =
    begin
      File.read(File.join(APP_ROOT, "pages/#{code}.html"))
    rescue
      # Missing or unreadable page: still send a (tiny) body.
      ':('
    end
  str << "\r\n"
  str << body
  str
end
end