strip-stats/espn_pickem_strip_stats.rb

175 lines
4.9 KiB
Ruby

require 'Nokogiri'
require 'open-uri'
require 'tzinfo'
require 'uri'
require './StripStats.rb'
# Extracts per-matchup rows from an ESPN NFL Pigskin Pick'em entry page.
#
# Subclasses StripStats (loaded from ./StripStats.rb, not visible here) and
# provides the page-specific #strip_data implementation.
class EspnPickemStripStats < StripStats
  # Months whose number (per StripStats.month_int_from_string) is below this
  # value are assigned to the calendar year after the season's start year.
  @@season_start_month = 8
  # Time zone the page's kickoff times are interpreted in before UTC conversion.
  @@timezone_string = 'US/Eastern'

  # A source is fetched remotely only when the WHOLE string is a URL. An
  # unanchored test would also match raw HTML that merely contains a link.
  REMOTE_SOURCE_PATTERN = %r{\A(?:https?|ftp)://\S+\z}i.freeze
  # Captures the season's start year from the page's canonical link.
  CANONICAL_SEASON_PATTERN = %r{https?://games\.espn\.com/nfl-pigskin-pickem/(\d+)}.freeze

  # Parses the pick'em table and returns one hash per playable matchup row.
  #
  # @param html_source [String] path of a local HTML file, an http(s)/ftp URL,
  #   or the HTML markup itself
  # @return [Array<Hash>] rows keyed by :matchup_id, :game_id, :datetime,
  #   :network (only when the row has a div.tvNetwork), :away_team_points,
  #   :home_team_points, :away_team, :home_team, :spread and :confidence.
  #   Empty when the source could not be loaded (the error message is printed).
  # @raise [ArgumentError] if a row needs the season year but the canonical
  #   link does not contain it
  def strip_data(html_source)
    html_doc = load_pickem_document(html_source)
    return [] if html_doc.nil?

    data = []
    # Resolved on the first row that needs it, so a page without usable rows
    # is not required to carry a canonical link.
    start_year = nil

    html_doc.css('table.pickemTable tr.matchupRow').each do |matchup_row|
      row = {}
      row[:matchup_id] = matchup_row.attr('data-matchupid')

      # A row can lack the game link (e.g. a postponed game); skip it.
      games_played = matchup_row.css('a.matchupLink')
      next if games_played.empty?

      match_data = games_played.attr('href').to_s.match(/\?gameId=(\d+)/)
      next if match_data.nil?

      row[:game_id] = match_data[1]

      start_year ||= season_start_year(html_doc)
      row[:datetime] = kickoff_utc_string(matchup_row, start_year)

      # Media network (optional)
      tv_networks = matchup_row.css('div.tvNetwork')
      row[:network] = tv_networks.first.inner_html.strip unless tv_networks.empty?

      # Divs for each team
      div_teams = matchup_row.css('td.teams').first.css('div.pickem-teams')
      row[:away_team_points] = div_teams.first.css('div.away').first.inner_html.strip
      row[:home_team_points] = div_teams.last.css('div.home').first.inner_html.strip

      # NOTE(review): data-f / data-u / data-s are mapped to away team, home
      # team and spread exactly as before; confirm against the page markup.
      pick_button = div_teams.last.css('button').first
      row[:away_team] = pick_button.attr('data-f').to_s.strip
      row[:home_team] = pick_button.attr('data-u').to_s.strip
      row[:spread] = pick_button.attr('data-s').to_s.strip

      # Home team confidence %
      divs = matchup_row.css('td.picked').first.css('div.wpwOutsideWrapper')
      row[:confidence] = divs.last.css('span').first.inner_html

      data << row
    end

    data
  end

  private

  # Loads html_source into a Nokogiri document, or returns nil (after printing
  # the error message) when the file or URL cannot be read.
  def load_pickem_document(html_source)
    if File.exist?(html_source)
      # File.open, not Kernel#open: the latter would run "|cmd" strings.
      File.open(html_source) { |io| Nokogiri::HTML(io) }
    elsif html_source =~ REMOTE_SOURCE_PATTERN
      # OpenURI::OpenRead#open (mixed into the URI by open-uri).
      URI.parse(html_source).open { |io| Nokogiri::HTML(io) }
    else
      Nokogiri::HTML(html_source)
    end
  rescue StandardError => e
    puts e.message
    nil
  end

  # Reads the season's start year from the document's canonical link.
  def season_start_year(html_doc)
    url = html_doc.xpath("//head/link[@rel='canonical']/@href").to_s
    season = url.match(CANONICAL_SEASON_PATTERN)
    raise ArgumentError, "cannot determine season year from canonical URL #{url.inspect}" if season.nil?

    season[1].to_i
  end

  # Combines a row's date and time cells into a UTC timestamp string
  # formatted as '%Y-%m-%d %H:%M:%S %z'.
  def kickoff_utc_string(matchup_row, start_year)
    # The date is either "<something>, <Month> <day>" or just "<Month> <day>";
    # either way the last comma-separated part holds month and day.
    date = matchup_row.css('div.pickem-date').first.inner_html.strip
    month_and_day_parts = date.split(',').last.strip.split(' ')
    month = month_and_day_parts.first
    day = month_and_day_parts.last

    year = start_year
    year += 1 if StripStats.month_int_from_string(month) < @@season_start_month

    time = matchup_row.css('div.pickem-time').first.inner_html.strip
    matches = time.match(/(\d+):(\d+)(.+)/)
    hour = matches[1].to_i
    minute = matches[2].to_i
    # The trailing capture can carry leading whitespace ("1:00 PM"), so strip
    # it before testing; anything that is not AM is treated as PM, as before.
    if matches[3].strip.upcase.start_with?('AM')
      hour = 0 if hour == 12
    elsif hour < 12
      hour += 12
    end

    tz = TZInfo::Timezone.get(@@timezone_string)
    utc = tz.local_to_utc(Time.utc(year, month, day, hour, minute))
    utc.strftime('%Y-%m-%d %H:%M:%S %z')
  end
end
# Main program execution: for every configured entry, walk the season's weekly
# periods, build the entry-page URL and run EspnPickemStripStats#write_data.
URL_TEMPLATE = 'http://games.espn.com/nfl-pigskin-pickem/%d/en/entry?entryID=%d&period=%d'
OUTPUT_FILE = 'out.ruby.csv'
YEAR_ZERO = 2012 # year when first week corresponds to period=1
WEEKS_PER_SEASON = 17
# Field list handed to EspnPickemStripStats.new.
FIELDS = %i[
  matchup_id
  game_id
  datetime
  network
  away_team
  home_team
  away_team_points
  home_team_points
  spread
  confidence
]

# Alternative configurations. ENTRY_IDS[i] is scraped for season START_YEAR + i.
# 2015 and 2016
#START_YEAR = 2015
#ENTRY_IDS = [276612, 171981]
# 2016
#START_YEAR = 2016
#ENTRY_IDS = [171981]
# 2017
START_YEAR = 2017
ENTRY_IDS = [537245]

# Truncate (or create) the output file before any scraping starts.
File.write(OUTPUT_FILE, '')

ENTRY_IDS.each.with_index(START_YEAR) do |entry_id, year|
  # Periods are numbered continuously across seasons starting from YEAR_ZERO.
  first_period = (year - YEAR_ZERO) * WEEKS_PER_SEASON + 1
  last_period = first_period + WEEKS_PER_SEASON - 1
  puts format("Stripping ESPN Pick'Em data for %d...", year)

  (first_period..last_period).each do |period|
    # Get html from URL
    html_source = format(URL_TEMPLATE, year, entry_id, period)
    # Get html from file
    #html_source = "period_#{period}.html"
    #next unless File.exists?(html_source)
    scraper = EspnPickemStripStats.new(html_source, OUTPUT_FILE, FIELDS)
    scraper.write_data
    puts "Parsing #{html_source}"
  end
end