strip-stats/espn_pickem_strip_stats.rb
2017-08-25 16:16:33 -05:00

150 lines
4.3 KiB
Ruby

require 'Nokogiri'
require 'open-uri'
require 'tzinfo'
require 'uri'
require './StripStats.rb'
# Scrapes the matchup table of an ESPN Pigskin Pick'em entry page into an
# array of row hashes.
#
# Overrides StripStats#strip_data. NOTE(review): StripStats is defined in
# ./StripStats.rb and is not visible here; presumably its write_data calls
# strip_data and writes the returned rows - confirm against that class.
class EspnPickemStripStats < StripStats
  # A string that is, in its entirety, a URL open-uri can fetch. Anchored on
  # purpose: raw HTML that merely contains a link must not be taken for a URL.
  PICKEM_REMOTE_SOURCE = %r{\A(?:https?|ftp)://\S+\z}i

  # Captures the season year from the page's canonical pick'em URL.
  PICKEM_SEASON_YEAR = %r{https?://games\.espn\.com/nfl-pigskin-pickem/(\d+)}

  # Captures the game id from a matchup link's query string.
  PICKEM_GAME_ID = /\?gameId=(\d+)/

  # Captures hour, minute and the trailing AM/PM text of a kickoff time.
  PICKEM_CLOCK = /(\d+):(\d+)(.+)/

  private_constant :PICKEM_REMOTE_SOURCE, :PICKEM_SEASON_YEAR,
                   :PICKEM_GAME_ID, :PICKEM_CLOCK

  # Extracts one hash per 'table.pickemTable tr.matchupRow' element.
  #
  # @param html_source [String] a URL, a path to an existing file, or raw HTML
  # @return [Array<Hash>] rows keyed by :matchup_id, :game_id, :datetime,
  #   :network, :away_team_points, :home_team_points, :away_team, :home_team,
  #   :spread and :confidence; empty when the source cannot be loaded or the
  #   page has no matchup rows
  # @raise [RuntimeError] when rows exist but the season year or a kickoff
  #   time cannot be read from the page
  def strip_data(html_source)
    html_doc = load_pickem_document(html_source)
    return [] if html_doc.nil?

    matchup_rows = html_doc.css('table.pickemTable tr.matchupRow')
    return [] if matchup_rows.empty?

    # Both values are identical for every row, so resolve them once, and only
    # when there is at least one row that needs them.
    season_year = pickem_season_year(html_doc)
    eastern = TZInfo::Timezone.get('US/Eastern')

    matchup_rows.map { |matchup_row| parse_matchup_row(matchup_row, season_year, eastern) }
  end

  private

  # Parses html_source into a Nokogiri document.
  #
  # The URL check comes first and is anchored, so an HTML string is only
  # fetched or opened when the whole string is a URL or names an existing file.
  # Block forms are used so the file / download handle is closed after parsing.
  #
  # @return [Nokogiri::HTML::Document, nil] nil after printing the error
  #   message when loading fails
  def load_pickem_document(html_source)
    if html_source.match?(PICKEM_REMOTE_SOURCE)
      URI.parse(html_source).open { |io| Nokogiri::HTML(io) }
    elsif File.exist?(html_source)
      File.open(html_source) { |file| Nokogiri::HTML(file) }
    else
      Nokogiri::HTML(html_source)
    end
  rescue StandardError => e
    # StandardError only: Interrupt / SystemExit must still stop the script.
    puts e.message
    nil
  end

  # Reads the season year out of the <link rel="canonical"> URL in <head>.
  def pickem_season_year(html_doc)
    canonical_url = html_doc.xpath("//head/link[@rel='canonical']/@href").to_s
    canonical_url[PICKEM_SEASON_YEAR, 1] ||
      raise("Season year not found in canonical URL #{canonical_url.inspect}")
  end

  # Builds the row hash for a single matchup <tr>. Keys are inserted in a fixed
  # order (the same order for every row).
  def parse_matchup_row(matchup_row, season_year, zone)
    game_href = matchup_row.css('a.matchupLink').attr('href').to_s
    team_divs = matchup_row.css('td.teams').first.css('div.pickem-teams')
    # Team codes and spread are data-* attributes on the first button of the
    # last team div.
    pick_button = team_divs.last.css('button').first
    confidence_divs = matchup_row.css('td.picked').first.css('div.wpwOutsideWrapper')

    {
      matchup_id: matchup_row.attr('data-matchupid'),
      # Empty string when the link carries no gameId parameter.
      game_id: game_href[PICKEM_GAME_ID, 1] || '',
      datetime: kickoff_utc(matchup_row, season_year, zone),
      network: matchup_row.css('div.tvNetwork').first.inner_html.strip,
      away_team_points: team_divs.first.css('div.away').first.inner_html.strip,
      home_team_points: team_divs.last.css('div.home').first.inner_html.strip,
      away_team: pick_button.attr('data-f').to_s.strip,
      home_team: pick_button.attr('data-u').to_s.strip,
      spread: pick_button.attr('data-s').to_s.strip,
      confidence: confidence_divs.last.css('span').first.inner_html
    }
  end

  # Combines the row's date and time cells with the season year, interprets
  # the result as wall-clock time in +zone+ and formats it in UTC.
  #
  # @return [String] formatted as '%Y-%m-%d %H:%M:%S %z'
  def kickoff_utc(matchup_row, season_year, zone)
    date_text = matchup_row.css('div.pickem-date').first.inner_html.strip
    # Keep only the text after the last comma (if any), then split it into
    # month and day tokens.
    month_and_day = date_text.split(',').last.strip.split(' ')
    month = month_and_day.first
    day = month_and_day.last

    time_text = matchup_row.css('div.pickem-time').first.inner_html.strip
    clock = time_text.match(PICKEM_CLOCK)
    raise "Unrecognised kickoff time #{time_text.inspect}" if clock.nil?

    hour = clock[1].to_i
    # Strip before comparing so a suffix such as " AM" is not mistaken for PM.
    meridiem = clock[3].strip.upcase
    if meridiem.start_with?('AM')
      hour = 0 if hour == 12
    elsif hour < 12
      hour += 12
    end

    # Time.utc is only a container for the wall-clock fields here;
    # local_to_utc reinterprets them in +zone+.
    wall_clock = Time.utc(season_year, month, day, hour, clock[2])
    zone.local_to_utc(wall_clock).strftime('%Y-%m-%d %H:%M:%S %z')
  end
end
# Main program execution: for every configured season, build the entry URL of
# each weekly period and hand it to EspnPickemStripStats#write_data.
URL_TEMPLATE = 'http://games.espn.com/nfl-pigskin-pickem/%d/en/entry?entryID=%d&period=%d'
OUTPUT_FILE = 'espn_pickem_ruby.csv'
YEAR_ZERO = 2012 # year when first week corresponds to period=1
WEEKS_PER_SEASON = 17
# Row keys passed to EspnPickemStripStats together with the output file.
FIELDS = %i[
  matchup_id
  game_id
  datetime
  network
  away_team
  home_team
  away_team_points
  home_team_points
  spread
  confidence
]
# One entry id per season, starting at START_YEAR (2015 and 2016).
START_YEAR = 2015
ENTRY_IDS = [276612, 171981]

# Start from an empty output file.
File.write(OUTPUT_FILE, '')

ENTRY_IDS.each.with_index(START_YEAR) do |entry_id, season|
  # Period numbers keep counting up across seasons from YEAR_ZERO.
  first_period = (season - YEAR_ZERO) * WEEKS_PER_SEASON + 1
  puts format("Stripping ESPN Pick'Em data for %d...", season)

  WEEKS_PER_SEASON.times do |week|
    period = first_period + week
    page_source = format(URL_TEMPLATE, season, entry_id, period)
    # To parse saved pages instead of fetching them, use:
    #   page_source = "period_#{period}.html"
    #   next unless File.exist?(page_source)
    EspnPickemStripStats.new(page_source, OUTPUT_FILE, FIELDS).write_data
    puts "Parsing #{page_source}"
  end
end