ref: master
app/services/feed_handler.rb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
require 'feedparser'
require 'open-uri'

# This class is responsible for processing feeds and passing the items to the
# respective container.
#
# The <tt>max_errors</tt> attribute controls how many times it will retry in
# case of failure. If a feed fails for <tt>max_errors+1</tt> times, it will be
# disabled and the last error message will be recorded in the container.
# The default value is *6*, if you need to change it you can do that in your
# config/local.rb file like this:
#
#   FeedHandler.max_errors = 10
#
# A disabled container is re-enabled by #process once its last fetch is older
# than <tt>disabled_period</tt> (default: one week).
#
# For the update interval, see FeedUpdater.
class FeedHandler

  # Errors raised by this class. They inherit from StandardError so that a
  # plain +rescue+ catches them; code that rescues them by name (or rescues
  # Exception) keeps working.
  class InvalidUrl < StandardError; end
  class ParseError < StandardError; end
  class FetchError < StandardError; end

  # The maximum number of consecutive failures tolerated before a container is
  # disabled.
  cattr_accessor :max_errors
  # How old the last fetch of a disabled container must be before #process
  # re-enables it.
  cattr_accessor :disabled_period

  self.max_errors = 6
  self.disabled_period = 1.week

  # Parses raw feed +content+ and returns a FeedParser::Feed.
  #
  # Raises FeedHandler::ParseError when +content+ is nil or cannot be parsed.
  def parse(content)
    raise FeedHandler::ParseError, "Content is nil" if content.nil?
    begin
      # Work on a copy: force_encoding mutates its receiver, and the caller's
      # string may be frozen or reused.
      FeedParser::Feed.new(content.dup.force_encoding('utf-8'))
    rescue StandardError
      raise FeedHandler::ParseError, "Invalid feed format."
    end
  end

  # Downloads +address+ and returns the response body as a String.
  #
  # +header+ is passed to open-uri as its options hash (request headers plus
  # options such as <tt>:proxy</tt>); it is not modified. In the test
  # environment, an +address+ naming an existing local file is read from disk
  # instead.
  #
  # Raises FeedHandler::FetchError for any failure, including an invalid URL.
  def fetch(address, header = {})
    if Rails.env == 'test' && File.exist?(address)
      File.read(address)
    else
      unless valid_url?(address)
        raise InvalidUrl.new("\"%s\" is not a valid URL" % address)
      end
      request_header = header.merge("User-Agent" => "Noosfero/#{Noosfero::VERSION}")
      # Open through the parsed URI object rather than Kernel#open: Kernel#open
      # spawns a subprocess for strings starting with "|", and no longer opens
      # URLs at all on Ruby 3.0+.
      URI.parse(address).open(request_header) { |stream| stream.read }
    end
  rescue StandardError => ex
    raise FeedHandler::FetchError, ex.message
  end

  # Same as #fetch, but applies the feed proxy configured in +environment+
  # (the HTTPS proxy for "https://" addresses, the HTTP proxy otherwise) and
  # turns off SSL certificate verification when
  # <tt>environment.disable_feed_ssl</tt> is set.
  def fetch_through_proxy(address, environment)
    header = {}
    proxy = if address.start_with?("https://")
      environment.https_feed_proxy
    else
      environment.http_feed_proxy
    end
    header[:proxy] = proxy if proxy
    header[:ssl_verify_mode] = OpenSSL::SSL::VERIFY_NONE if environment.disable_feed_ssl
    fetch(address, header)
  end

  # Updates +container+ from its feed inside a transaction.
  #
  # A container that exceeded <tt>max_errors</tt> is re-enabled first when its
  # last fetch is older than <tt>disabled_period</tt>. Disabled containers are
  # skipped. On failure the error is logged, the container's error counter is
  # incremented, and the container is disabled once the counter exceeds
  # <tt>max_errors</tt>.
  def process(container)
    container.class.transaction do
      if failed_too_many_times(container) && enough_time_since_last_failure(container)
        container.enabled = true
        container.update_errors = 0
        container.save
      end
      next unless container.enabled
      actually_process_container(container)
      container.update_errors = 0
      container.finish_fetch
    end
  rescue StandardError => exception
    # StandardError only: signals and exit requests must propagate instead of
    # being counted as feed failures.
    log_failure("Unknown error from %s ID %d\n%s", container, exception)
    # Discard whatever the failed attempt left in memory before recording the
    # failure.
    container.reload
    container.update_errors += 1
    container.error_message = exception.to_s
    if failed_too_many_times(container)
      container.fetched_at = Time.now
      container.enabled = false
    end
    begin
      container.finish_fetch
    rescue StandardError => finish_fetch_exception
      log_failure("Unable to finish fetch from %s ID %d\n%s", container, finish_fetch_exception)
    end
  end

  protected

  # Clears +container+, fetches and parses its feed, then stores the feed
  # title and the items on the container.
  def actually_process_container(container)
    container.clear
    content = if container.environment.enable_feed_proxy
      fetch_through_proxy(container.address, container.environment)
    else
      fetch(container.address)
    end
    container.fetched_at = Time.now
    parsed_feed = parse(content)
    container.feed_title = parsed_feed.title
    # The selected items are added in reverse of feed order.
    # NOTE(review): a limit of 0 yields the range 0..-1 and therefore selects
    # every item - confirm whether that is intended.
    parsed_feed.items[0..container.limit-1].reverse_each do |item|
      container.add_item(item.title, item.link, item.date, item.content)
    end
  end

  # Returns true only when +url+ as a whole parses as an http or https URI
  # with a host. A string that merely contains a URL is rejected.
  def valid_url?(url)
    return false unless url.respond_to?(:to_str)
    uri = URI.parse(url)
    uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
  rescue URI::InvalidURIError
    false
  end

  # True when the container has accumulated more than <tt>max_errors</tt>
  # failures.
  def failed_too_many_times(container)
    container.update_errors > FeedHandler.max_errors
  end

  # True when the container was never fetched, or was last fetched more than
  # <tt>disabled_period</tt> ago.
  def enough_time_since_last_failure(container)
    container.fetched_at.nil? || container.fetched_at < (Time.now - FeedHandler.disabled_period)
  end

  # Logs +exception+ and its backtrace as warnings. +format+ receives the
  # container class name, the container id and the exception message.
  def log_failure(format, container, exception)
    Rails.logger.warn(format % [container.class.name, container.id, exception.to_s])
    Rails.logger.warn("Backtrace:\n%s" % Array(exception.backtrace).join("\n"))
  end

end