require 'nokogiri'
require 'open-uri'
require 'tzinfo'
require 'uri'
require './StripStats.rb'

# Scrapes an ESPN Pigskin Pick'em entry page into an array of per-matchup
# hashes (ids, kickoff time, TV network, teams, points, spread, confidence).
class EspnPickemStripStats < StripStats
  # Months numbered below this value are treated as belonging to the calendar
  # year after the season's starting year (see #pickem_kickoff_utc).
  @@season_start_month = 8

  # Time zone in which the page's kickoff dates/times are interpreted before
  # being converted to UTC.
  @@timezone_string = 'US/Eastern'

  # Extracts one hash per playable matchup row of the pick'em table.
  #
  # NOTE(review): the original header said this overrides `get_data` from
  # StripStats; presumably StripStats invokes `strip_data` — confirm against
  # StripStats.rb.
  #
  # @param html_source [String] path of an HTML file, an http/https/ftp URL,
  #   or the raw HTML markup itself
  # @return [Array<Hash>] one hash per matchup with the keys :matchup_id,
  #   :game_id, :datetime, :network (only when listed), :away_team_points,
  #   :home_team_points, :away_team, :home_team, :spread and :confidence;
  #   empty when the source could not be loaded
  # @raise [RuntimeError] if a matchup row is found but the page's canonical
  #   link does not contain the season year
  def strip_data(html_source)
    html_doc = pickem_load_document(html_source)
    # Loading failed (already reported); yield no rows instead of crashing on nil.
    return [] if html_doc.nil?

    data = []
    start_year = nil

    html_doc.css('table.pickemTable tr.matchupRow').each do |matchup_row|
      row = {}

      # Matchup id
      row[:matchup_id] = matchup_row.attr('data-matchupid')

      # Game id (a row can lack the link if the game was postponed)
      games_played = matchup_row.css('a.matchupLink')
      next if games_played.empty?

      match_data = games_played.attr('href').to_s.match(/\?gameId=(\d+)/)
      next if match_data.nil?

      row[:game_id] = match_data[1]

      # Kickoff date/time; the season year is the same for every row, so it is
      # looked up only once, and only when a usable row actually exists.
      start_year ||= pickem_season_start_year(html_doc)
      row[:datetime] = pickem_kickoff_utc(matchup_row, start_year)

      # Media network (not every row lists one)
      tv_networks = matchup_row.css('div.tvNetwork')
      row[:network] = tv_networks.first.inner_html.strip unless tv_networks.empty?

      # Divs holding the two teams of the matchup
      div_teams = matchup_row.css('td.teams').first.css('div.pickem-teams')

      row[:away_team_points] = div_teams.first.css('div.away').first.inner_html.strip
      row[:home_team_points] = div_teams.last.css('div.home').first.inner_html.strip

      # The first button of the last team div carries both team codes and the
      # spread as data attributes.
      pick_button = div_teams.last.css('button').first
      row[:away_team] = pick_button.attr('data-f').to_s.strip
      row[:home_team] = pick_button.attr('data-u').to_s.strip
      row[:spread] = pick_button.attr('data-s').to_s.strip

      # Confidence %: first span of the last wrapper in the "picked" cell
      divs = matchup_row.css('td.picked').first.css('div.wpwOutsideWrapper')
      row[:confidence] = divs.last.css('span').first.inner_html

      data << row
    end

    data
  end

  private

  # Matches a string that is, in its entirety, a URL open-uri can fetch.
  # Anchored so that raw HTML which merely contains a link is not mistaken
  # for a URL.
  PICKEM_REMOTE_URL = %r{\A(?:https?|ftp)://\S+\z}i.freeze

  # Parses html_source into a Nokogiri document.
  #
  # @param html_source [String] file path, URL or raw HTML
  # @return [Nokogiri::HTML::Document, nil] nil when reading or fetching
  #   failed (the error message is printed)
  def pickem_load_document(html_source)
    if File.exist?(html_source)
      # Block form closes the handle once parsing is done.
      File.open(html_source) { |file| Nokogiri::HTML(file) }
    elsif html_source.match?(PICKEM_REMOTE_URL)
      # URI.open instead of Kernel#open: the latter no longer fetches URLs on
      # Ruby >= 3.0 and would execute strings that start with "|".
      URI.open(html_source) { |io| Nokogiri::HTML(io) }
    else
      Nokogiri::HTML(html_source)
    end
  rescue StandardError => e
    puts e.message
    nil
  end

  # Reads the season's starting year from the page's canonical link.
  #
  # @param html_doc [Nokogiri::HTML::Document]
  # @return [Integer]
  # @raise [RuntimeError] if the canonical link has no recognizable year
  def pickem_season_start_year(html_doc)
    url = html_doc.xpath("//head/link[@rel='canonical']/@href").to_s
    match = url.match(%r{https?://games\.espn\.com/nfl-pigskin-pickem/(\d+)})
    raise "Cannot determine season year from canonical URL #{url.inspect}" if match.nil?

    match[1].to_i
  end

  # Builds the UTC kickoff timestamp of a matchup row.
  #
  # @param matchup_row [Nokogiri::XML::Element] a tr.matchupRow element
  # @param start_year [Integer] calendar year in which the season started
  # @return [String] kickoff formatted as '%Y-%m-%d %H:%M:%S %z' in UTC
  def pickem_kickoff_utc(matchup_row, start_year)
    # Date: keep only the text after the last comma (or all of it when there
    # is no comma), then take its first word as month and last word as day.
    date = matchup_row.css('div.pickem-date').first.inner_html.strip
    month_and_day_parts = date.split(',').last.strip.split(' ')
    month = month_and_day_parts.first
    day = month_and_day_parts.last

    # Months before the season's start month fall in the following year.
    year = start_year
    year += 1 if StripStats.month_int_from_string(month) < @@season_start_month

    # Time: convert the 12-hour clock reading to 24-hour. Anything without a
    # standalone "AM" token is treated as PM, as before; looking for the token
    # (rather than comparing the whole remainder) tolerates "11:00 AM" with a
    # separating space or a trailing zone label.
    time = matchup_row.css('div.pickem-time').first.inner_html.strip
    matches = time.match(/(\d+):(\d+)(.+)/)
    hour = matches[1].to_i
    minute = matches[2].to_i
    if matches[3].match?(/\bAM\b/i)
      hour = 0 if hour == 12
    elsif hour < 12
      hour += 12
    end

    # Time.utc only carries the wall-clock fields; TZInfo interprets them in
    # the configured zone and returns the corresponding UTC instant.
    tz = TZInfo::Timezone.get(@@timezone_string)
    utc = tz.local_to_utc(Time.utc(year, month, day, hour, minute))
    utc.strftime('%Y-%m-%d %H:%M:%S %z')
  end
end

# Main program execution
URL_TEMPLATE = 'http://games.espn.com/nfl-pigskin-pickem/%d/en/entry?entryID=%d&period=%d'
OUTPUT_FILE = 'out.ruby.csv'
YEAR_ZERO = 2012 # year when first week corresponds to period=1
WEEKS_PER_SEASON = 17
FIELDS = [
  :matchup_id,
  :game_id,
  :datetime,
  :network,
  :away_team,
  :home_team,
  :away_team_points,
  :home_team_points,
  :spread,
  :confidence,
]

# 2015 and 2016
#START_YEAR = 2015
#ENTRY_IDS = [276612, 171981]

# 2016
#START_YEAR = 2016
#ENTRY_IDS = [171981]

# 2017
START_YEAR = 2017
ENTRY_IDS = [537245]

File.open(OUTPUT_FILE, 'w') {} # truncate file

# ENTRY_IDS holds one entry id per consecutive season, beginning at START_YEAR.
ENTRY_IDS.each_with_index do |entry_id, i|
  year = i + START_YEAR
  # Period numbers keep counting across seasons, WEEKS_PER_SEASON per year
  # since YEAR_ZERO.
  start_period = (year - YEAR_ZERO) * WEEKS_PER_SEASON + 1
  end_period = start_period + WEEKS_PER_SEASON - 1

  puts "Stripping ESPN Pick'Em data for %d..." % [year]

  start_period.upto(end_period) do |period|
    # Get html from URL
    html_source = URL_TEMPLATE % [year, entry_id, period]

    # Get html from file
    #html_source = "period_#{period}.html"
    #next unless File.exists?(html_source)

    espn_pickem_strip_stats = EspnPickemStripStats.new(html_source, OUTPUT_FILE, FIELDS)
    espn_pickem_strip_stats.write_data

    puts "Parsing #{html_source}"
  end
end