# Fetches the page at +url+ over HTTP or HTTPS and returns it as a Page.
#
# +url+ is either a URI object or a String that URI.parse accepts.
#
# Recognized +options+ (all optional):
# * <tt>:redirect_limit</tt> -- number of requests still allowed while
#   following redirects; defaults to REDIRECT_LIMIT.
# * <tt>:http_timeout</tt> -- used for both the open and the read timeout;
#   defaults to DEFAULT_TIMEOUT.
# * <tt>:user_agent</tt> -- value sent in the User-Agent request header.
# * <tt>:last_modified</tt> -- validator sent as If-Modified-Since.
# * <tt>:etag</tt> -- validator sent as If-None-Match.
# * <tt>:source_url</tt> -- URL to report in the returned Page instead of
#   the URL actually requested (set internally when following temporary
#   redirects).
#
# Returns <tt>Page[url, body, encoding, last_modified, etag]</tt>. For a
# 304 Not Modified response the body and encoding are nil and the
# validators are the ones supplied in +options+.
#
# Raises:
# * HTTPRedirectLimitError -- the redirect limit is exhausted.
# * HTTPInvalidURLError -- the URL (or a redirect target) cannot be parsed
#   or is not http/https.
# * HTTPTimeoutError -- the request timed out, or the server answered 408.
# * HTTPNotFoundError -- the server answered 404.
# * HTTPNoAccessError -- the server answered 401 or 403.
# * HTTPUnspecifiedError -- any other request failure or response status.
def read_page(url, options = nil)
  options ||= {}
  redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
  # <= rather than == so that a negative limit cannot recurse without bound.
  raise HTTPRedirectLimitError if redirect_limit <= 0

  if url.is_a?(URI)
    uri = url
  else
    begin
      uri = URI.parse(url)
    rescue StandardError => error
      raise HTTPInvalidURLError.new(error)
    end
  end
  # \A and \z anchor the whole string; ^ and $ would only anchor a line.
  raise HTTPInvalidURLError unless uri.scheme.to_s =~ /\Ahttps?\z/

  begin
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == "https")
    http.close_on_empty_response = true
    http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT

    # An empty path (as in "http://example.com") is not a valid request
    # target, so fall back to the root.
    path = uri.path.to_s
    path = "/" if path.empty?
    path = "#{path}?#{uri.query}" if uri.query

    headers = {}
    headers["User-Agent"] = options[:user_agent] if options[:user_agent]
    # Conditional GET: validators travel in If-* request headers.
    # Last-Modified and ETag are response headers and are ignored by
    # servers when sent on a request.
    headers["If-Modified-Since"] = options[:last_modified] if options[:last_modified]
    headers["If-None-Match"] = options[:etag] if options[:etag]
    response = http.request_get(path, headers)
  rescue Timeout::Error => error
    raise HTTPTimeoutError.new(error)
  rescue StandardError => error
    # StandardError, not Exception: Interrupt, SystemExit and similar must
    # propagate untouched instead of being reported as HTTP failures.
    raise HTTPUnspecifiedError.new(error)
  end

  page_url = options[:source_url] || uri
  case response
  when Net::HTTPSuccess
    content_type = response["Content-Type"]
    # Tolerate a quoted charset and trailing parameters, e.g.
    # 'text/html; charset="utf-8"; foo=bar'.
    match = content_type && content_type.match(/charset=["']?([^\s;"']+)/i)
    encoding = match && match[1]
    Page[page_url, response.body, encoding,
         response["Last-Modified"], response["ETag"]]
  when Net::HTTPNotModified
    # Must be tested before HTTPRedirection, which is its superclass.
    Page[page_url, nil, nil, options[:last_modified], options[:etag]]
  when Net::HTTPRedirection
    begin
      # Location may be relative; resolve it against the current URL.
      target = uri.merge(response["location"])
    rescue StandardError => error
      raise HTTPInvalidURLError.new(error)
    end
    # A permanent redirect makes the new location the page's URL; any
    # other redirect keeps reporting the URL originally asked for.
    source_url = response.is_a?(Net::HTTPMovedPermanently) ? nil : page_url
    # Forward the caller's options (user agent, timeout, validators) so
    # they also apply to the redirected request.
    read_page(target, options.merge(:redirect_limit => redirect_limit - 1,
                                    :source_url => source_url))
  when Net::HTTPNotFound
    raise HTTPNotFoundError
  when Net::HTTPUnauthorized, Net::HTTPForbidden
    raise HTTPNoAccessError
  when Net::HTTPRequestTimeOut
    raise HTTPTimeoutError
  else
    raise HTTPUnspecifiedError
  end
end