ref: master
plugins/lattes_curriculum/lib/html_parser.rb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
require 'nokogiri'
require 'open-uri'

Encoding.default_external = Encoding::UTF_8
Encoding.default_internal = Encoding::UTF_8

# Downloads a Lattes curriculum page and strips the parts of its markup that
# should not be shown when the curriculum is embedded elsewhere (tooltips,
# images, label/select form controls, the footer and the "OutrasI..." section).
class Html_parser
  # Fetches +lattes_link+ and returns the cleaned-up HTML of the page's
  # ".main-content" element.
  #
  # No exception is propagated to the caller: on any failure a message is
  # returned instead of the HTML. The messages go through +_+, which is not
  # defined in this file (presumably the host application's gettext helper).
  #
  # @param lattes_link [String] URL of the Lattes curriculum page
  # @return [String] the cleaned HTML, or an error message
  def get_html(lattes_link = "")
    uri = URI.parse(lattes_link)
    # Only the URI classes extended by open-uri respond to #open. Rejecting
    # anything else keeps a user-supplied value away from Kernel#open, which
    # would read local files or execute a "|command" string.
    return _("Could not import the lattes") unless uri.respond_to?(:open)

    # The block form closes the downloaded stream once it has been parsed.
    document = uri.open { |io| Nokogiri::HTML(io, nil, "UTF-8") }
    page = document.css(".main-content").to_s
    # A page without ".main-content" is not a curriculum that can be imported.
    return _("Could not import the lattes") if page.empty?

    page = remove_class_tooltip(page)
    page = remove_img(page)
    page = remove_select(page)
    page = remove_footer(page)
    remove_further_informations(page)
  rescue OpenURI::HTTPError
    _("Lattes not found. Please, make sure the informed URL is correct.")
  rescue Timeout::Error
    _("Lattes Platform is unreachable. Please, try it later.")
  rescue StandardError
    _("Could not import the lattes")
  end

  # Replaces every class="tooltip" attribute with class="link_not_to_mark".
  # The argument itself is left unmodified.
  #
  # @param page [String] HTML fragment
  # @return [String] a new string with the attribute replaced
  def remove_class_tooltip(page = "")
    page.gsub('class="tooltip"', 'class="link_not_to_mark"')
  end

  # Removes every "<img ...>" tag, from "<img" up to the next ">".
  #
  # @param page [String] HTML fragment
  # @return [String] the fragment without image tags
  def remove_img(page = "")
    cut_sections(page, '<img', '>', true)
  end

  # Removes every span that starts at "<label" and ends at the next
  # "</select>".
  #
  # @param page [String] HTML fragment
  # @return [String] the fragment without those spans
  def remove_select(page = "")
    cut_sections(page, '<label', '</select>', true)
  end

  # Removes the footer: the first span that starts at
  # '<div class="rodape-cv">' and ends at the next 'Imprimir Currículo</a>'.
  #
  # @param page [String] HTML fragment
  # @return [String] the fragment without the footer
  def remove_footer(page = "")
    cut_sections(page, '<div class="rodape-cv">', 'Imprimir Currículo</a>', false)
  end

  # Removes the first span that starts at '<a name="OutrasI' and ends at the
  # next "</div>".
  #
  # @param page [String] HTML fragment
  # @return [String] the fragment without that span
  def remove_further_informations(page = "")
    cut_sections(page, '<a name="OutrasI', '</div>', false)
  end

  private

  # Returns a copy of +page+ without the text that starts at +opening+ and
  # ends at the next +closing+ (both markers are removed too).
  #
  # When +closing+ never appears after an +opening+, everything from that
  # +opening+ to the end of the string is dropped, which is what the
  # split-based implementation this replaces did as well.
  #
  # @param page [String] text to clean
  # @param opening [String] marker that starts a section to remove
  # @param closing [String] marker that ends a section to remove
  # @param every_occurrence [Boolean] remove all sections (true) or only the
  #   first one (false)
  # @return [String] the cleaned text; +page+ itself is not modified
  def cut_sections(page, opening, closing, every_occurrence)
    kept = []
    remaining = page
    while (start = remaining.index(opening))
      kept << remaining[0, start]
      finish = remaining.index(closing, start + opening.length)
      # Unterminated section: nothing after the opening marker is kept.
      return kept.join if finish.nil?

      remaining = remaining[(finish + closing.length)..-1]
      break unless every_occurrence
    end
    kept << remaining
    kept.join
  end
end