Pastie now auto-senses if line-wrap is a bad or good idea. Feedback?
## mark a section (Learn more)
This paste will be private.
Index: lib/rbot/core/utils/httputil.rb =================================================================== --- lib/rbot/core/utils/httputil.rb (revision 809) +++ lib/rbot/core/utils/httputil.rb (working copy) @@ -13,6 +13,7 @@ require 'resolv' require 'net/http' +require 'iconv' begin require 'net/https' rescue LoadError => e @@ -24,17 +25,57 @@ class HTTPResponse # Read chunks from the body until we have at least _size_ bytes, yielding # the partial text at each chunk. Return the partial body. + attr_accessor :no_cache + if !instance_methods.include?('raw_body') + alias :raw_body :body + end + + def body_to_utf(str) + ctype = self['content-type'] || 'text/html' + return str unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i + charset = 'latin1' # should be in config + + if self['content-type'].match(/charset=["']?([^\s"']+)["']?/i) + debug "charset #{charset} set from header" + charset = $1 + end + + case str + when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i + charset = $1 + debug "xml charset #{charset} set from xml pi" + when /<(meta\s[^>]*http-equiv=["']?Content-Type["']?[^>]*)>/i + meta = $1 + if meta =~ /charset=['"]?([^\s'";]+)['"]?/ + charset = $1 + debug "html charset #{charset} set from meta" + end + end + + begin + return Iconv.iconv('utf-8', charset, str).join + rescue + debug "conversion failed" + return str + end + end + + def body + return self.body_to_utf(self.raw_body) + end + def partial_body(size=0, &block) + self.no_cache = true partial = String.new self.read_body { |chunk| partial << chunk - yield partial if block_given? + yield self.body_to_utf(partial) if block_given? break if size and size > 0 and partial.length >= size } - return partial + return self.body_to_utf(partial) end end end @@ -85,6 +126,7 @@ def self.maybe_new(resp) debug "maybe new #{resp}" + return nil if resp.no_cache return nil unless Net::HTTPOK === resp || Net::HTTPMovedPermanently === resp || Net::HTTPFound === resp || @@ -298,25 +340,9 @@ if block_given? yield(resp) else + # Net::HTTP wants us to read the whole body here resp.body end - - class << resp.body - def http_headers - if defined?(@http_headers) - @http_headers - else - nil - end - end - - def http_headers=(rsp) - @http_headers=rsp - end - end - - resp.body.http_headers = resp.to_hash - return resp end @@ -417,9 +443,16 @@ elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp debug "http error, deleting cached obj" if cached @cache.delete(cache_key) - elsif opts[:cache] && cached = CachedObject.maybe_new(resp) rescue nil - debug "storing to cache" - @cache[cache_key] = cached + elsif opts[:cache] + begin + return handle_response(uri, resp, opts, &block) + ensure + if cached = CachedObject.maybe_new(resp) rescue nil + debug "storing to cache" + @cache[cache_key] = cached + end + end + return ret end return handle_response(uri, resp, opts, &block) end Index: lib/rbot/core/utils/utils.rb =================================================================== --- lib/rbot/core/utils/utils.rb (revision 809) +++ lib/rbot/core/utils/utils.rb (working copy) @@ -433,7 +433,7 @@ # * :min_spaces => Minimum number of spaces a paragraph should have # def Utils.ircify_first_html_par(xml_org, opts={}) - xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "").utfy_xml + xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "") strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) Index: lib/rbot/core/utils/extends.rb =================================================================== --- lib/rbot/core/utils/extends.rb (revision 809) +++ lib/rbot/core/utils/extends.rb (working copy) @@ -27,13 +27,6 @@ end end -begin - require 'iconv' - $we_have_iconv = true -rescue LoadError - $we_have_iconv = false -end - # Extensions to the String class # # TODO make ircify_html() accept an Hash of options, and make riphtml() just @@ -41,44 +34,6 @@ # class ::String - # This method will try to transcode a String supposed to hold an XML or HTML - # document from the original charset to UTF-8. - # - # To find the original encoding, it will first see if the String responds to - # #http_headers(), and if it does it will assume that the charset indicated - # there is the correct one. Otherwise, it will try to detect the charset from - # some typical XML and HTML headers - def utfy_xml - return self unless $we_have_iconv - - charset = nil - - if self.respond_to?(:http_headers) and headers = self.http_headers - if headers['content-type'].first.match(/charset=(\S+?)\s*(?:;|\Z)/i) - debug "charset #{charset} set from header" - charset = $1 - end - end - - if not charset - case self - when /<\?xml.*encoding="(\S+)".*\?>/i - charset = $1 - when /<meta\s+http-equiv\s*=\s*["']?Content-Type["']?.*charset\s*=\s*(\S+?)(?:;|["']|\s).*>/i - charset = $1 - end - debug "charset #{charset} set from string" - end - - if charset - return Iconv.iconv('utf-8', charset, self).join rescue self - else - debug "Couldn't find charset for #{self.inspect}" - return self - end - - end - # This method will return a purified version of the receiver, with all HTML # stripped off and some of it converted to IRC formatting #
From the Design Piracy series on my blog: