cirandas.net

ref: master

app/services/feed_handler.rb


require 'feedparser'
require 'open-uri'

# This class is responsible for processing feeds and passing their items to
# the respective container.
#
# The <tt>max_errors</tt> attribute controls how many times a feed will be
# retried in case of failure. If a feed fails <tt>max_errors+1</tt> times in
# a row, it will be disabled and the last error message will be recorded in
# the container. The default value is *6*; if you need to change it, you can
# do so in your config/local.rb file like this:
#
#   FeedHandler.max_errors = 10
#
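# A container disabled this way is re-enabled and retried once
# <tt>disabled_period</tt> (1 week by default) has passed since its last
# fetch attempt.
#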
# For the update interval, see FeedUpdater.
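#
# A typical invocation looks like this (a sketch; the actual call site is
# FeedUpdater, and +container+ stands for any feed container object that
# responds to the methods used below):
#
#   handler = FeedHandler.new
#   handler.process(container)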
class FeedHandler

  # The maximum number of consecutive update failures tolerated before a
  # container is disabled
  cattr_accessor :max_errors
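  # How long a disabled container stays disabled before it is retried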
  cattr_accessor :disabled_period

  self.max_errors = 6
  self.disabled_period = 1.week

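  # Parses +content+ (the raw feed document, as a String) into a
  # FeedParser::Feed. Raises FeedHandler::ParseError if the content is nil
  # or cannot be parsed.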
  def parse(content)
    raise FeedHandler::ParseError, "Content is nil" if content.nil?
    begin
      return FeedParser::Feed.new(content.force_encoding('utf-8'))
    rescue Exception => ex
      raise FeedHandler::ParseError, "Invalid feed format."
    end
  end

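  # Fetches +address+ with open-uri and returns its content as a String. In
  # the test environment a local file path may be given instead of a URL.
  # Extra open-uri options can be passed in +header+; a Noosfero User-Agent
  # header is set on remote requests. Any failure (including an invalid URL)
  # is re-raised as FeedHandler::FetchError.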
  def fetch(address, header = {})
    begin
      content = ""
      block = lambda { |s| content = s.read }
      content =
        if Rails.env.test? && File.exist?(address)
          File.read(address)
        else
          if !valid_url?(address)
            raise InvalidUrl.new("\"%s\" is not a valid URL" % address)
          end
          header.merge!("User-Agent" => "Noosfero/#{Noosfero::VERSION}")
          open(address, header, &block)
        end
      return content
    rescue Exception => ex
      raise FeedHandler::FetchError, ex.message
    end
  end

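  # Like #fetch, but goes through the HTTP or HTTPS feed proxy configured in
  # +environment+ (when one is set for the address scheme) and disables SSL
  # certificate verification when environment.disable_feed_ssl is set.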
  def fetch_through_proxy(address, environment)
    header = {}
    if address.starts_with?("https://")
      header.merge!(:proxy => environment.https_feed_proxy) if environment.https_feed_proxy
    else
      header.merge!(:proxy => environment.http_feed_proxy) if environment.http_feed_proxy
    end
    header.merge!(:ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE) if environment.disable_feed_ssl
    fetch(address, header)
  end

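  # Updates a single feed container inside a database transaction. A
  # container that was disabled after too many failures is re-enabled once
  # disabled_period has elapsed since its last fetch; containers that are
  # still disabled are skipped. On success the error counter is reset; on
  # failure the error is logged and counted, and after more than max_errors
  # failures the container is disabled.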
  def process(container)
    begin
      container.class.transaction do
        if failed_too_many_times(container) && enough_time_since_last_failure(container)
          container.enabled = true
          container.update_errors = 0
          container.save
        end
        next unless container.enabled
        actually_process_container(container)
        container.update_errors = 0
        container.finish_fetch
      end
    rescue Exception => exception
      Rails.logger.warn("Unknown error from %s ID %d\n%s" % [container.class.name, container.id, exception.to_s])
      Rails.logger.warn("Backtrace:\n%s" % exception.backtrace.join("\n"))
      container.reload
      container.update_errors += 1
      container.error_message = exception.to_s
      if container.update_errors > FeedHandler.max_errors
        container.fetched_at = Time.now
        container.enabled = false
      end
      begin
        container.finish_fetch
      rescue Exception => finish_fetch_exception
        Rails.logger.warn("Unable to finish fetch from %s ID %d\n%s" % [container.class.name, container.id, finish_fetch_exception.to_s])
        Rails.logger.warn("Backtrace:\n%s" % finish_fetch_exception.backtrace.join("\n"))
      end
    end
  end

  class InvalidUrl < Exception; end
  class ParseError < Exception; end
  class FetchError < Exception; end

  protected

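  # Clears the container, downloads its feed (through the environment's
  # proxy when enable_feed_proxy is set), parses it, stores the feed title
  # and adds up to container.limit items in reverse order.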
  def actually_process_container(container)
    container.clear
    if container.environment.enable_feed_proxy
      content = fetch_through_proxy(container.address, container.environment)
    else
      content = fetch(container.address)
    end
    container.fetched_at = Time.now
    parsed_feed = parse(content)
    container.feed_title = parsed_feed.title
    parsed_feed.items.first(container.limit).reverse.each do |item|
      container.add_item(item.title, item.link, item.date, item.content)
    end
  end

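  # Returns a truthy value when +url+ matches the http or https URI pattern.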
  def valid_url?(url)
    url =~ URI.regexp('http') || url =~ URI.regexp('https')
  end

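  # Whether the container has already failed more than max_errors times.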
  def failed_too_many_times(container)
    container.update_errors > FeedHandler.max_errors
  end

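  # Whether the container was never fetched or its last fetch happened more
  # than disabled_period ago.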
  def enough_time_since_last_failure(container)
    container.fetched_at.nil? || container.fetched_at < (Time.now - FeedHandler.disabled_period)
  end

end