strip-stats/espn_pickem_strip_stats.py

157 lines
5.5 KiB
Python

"""Strip stats from ESPN Pick'Em"""
from pprint import pprint
import re
#from datetime import datetime
import httplib2
#from pytz import timezone
#import pytz
import sys
try:
from BeautifulSoup import BeautifulSoup
except ImportError:
from bs4 import BeautifulSoup
from StripStats import StripStats
class EspnPickemStripStats(StripStats):
    """Strip matchup stats from an ESPN Pick'Em entry page.

    Overrides ``strip_data`` from the ``StripStats`` base class.
    """

    # Class variables
    # Needed to handle the case when a season starts and finishes in different
    # years and the current year must be inferred: a game whose month number
    # is lower than this value is dated in the year after the season's start
    # year.
    season_start_month = 8

    def strip_data(self, url):
        """Download ``url`` and return one dict per matchup row on the page.

        Args:
            url: Address of the Pick'Em entry page to fetch.

        Returns:
            A list of dicts, one per ``tr.matchupRow`` inside the
            ``table.pickemTable`` element, with the keys ``matchup_id``,
            ``game_id``, ``datetime``, ``network``, ``away_team_points``,
            ``home_team_points``, ``away_team``, ``home_team``, ``spread``
            and ``confidence``. The list is empty when the table has no
            matchup rows.

        Raises:
            AttributeError, TypeError, KeyError, IndexError: when the page
            does not have the expected structure (missing table, link,
            attribute, or a URL that does not match the expected pattern).
        """
        http = httplib2.Http()
        _, body = http.request(url)  # response headers are not needed

        # Load HTML into an object
        html_doc = BeautifulSoup(body, 'html.parser')

        # Pick'Em table and its list of matchup rows
        pickem_table = html_doc.find('table', class_='pickemTable')
        matchup_rows = pickem_table.find_all('tr', class_='matchupRow')

        data = []
        if not matchup_rows:
            # Nothing to extract; skip the page-level lookups below so an
            # empty page still yields an empty result.
            return data

        # Compiled once per page instead of once per row. Both patterns
        # accept http and https so secure links do not break the match.
        game_id_regex = re.compile(
            r'https?://.+\?gameId=([\d]+)', re.IGNORECASE)
        season_regex = re.compile(
            r'https?://games.espn.com/nfl-pigskin-pickem/([\d]+)',
            re.IGNORECASE)

        # The season's start year is taken from the page's canonical URL; it
        # is a property of the page, so it is read once rather than per row.
        canonical_url = html_doc.find('link', rel='canonical')['href'].strip()
        season_start_year = int(season_regex.match(canonical_url).group(1))

        for matchup_row in matchup_rows:
            row = {}

            # Matchup id
            row['matchup_id'] = matchup_row['data-matchupid']

            # Game id
            game_href = matchup_row.find('a', class_='matchupLink')['href']
            row['game_id'] = game_id_regex.match(game_href).group(1)

            # Date (month and day): use the part after the first comma when
            # there is one, otherwise the whole text.
            date_text = matchup_row.find(
                'div', class_='pickem-date').string.strip()
            date_parts = date_text.split(',')
            if len(date_parts) > 1:
                month_and_day = date_parts[1].strip()
            else:
                month_and_day = date_parts[0].strip()
            month_and_day_parts = month_and_day.split(' ')
            month = month_and_day_parts[0].strip()
            day = month_and_day_parts[1].strip()
            if len(day) == 1:
                day = '0' + day  # pad with zero if necessary

            # Year: months before the season's start month belong to the
            # following calendar year.
            # NOTE(review): month_abbr_to_int is presumably a mapping from
            # month abbreviation to month number -- confirm in StripStats.
            year = season_start_year
            month_int = StripStats.month_abbr_to_int[month]
            if month_int < EspnPickemStripStats.season_start_month:
                year += 1

            # Time
            kickoff_time = matchup_row.find(
                'div', class_='pickem-time').string.strip()
            if kickoff_time[1] == ':':
                kickoff_time = '0' + kickoff_time  # pad with zero if necessary

            date_string = month + ' ' + day + ' ' + str(year) + ' ' + kickoff_time
            # NOTE(review): presumably parses the string as US/Eastern local
            # time and converts it to UTC -- confirm in StripStats.
            datetime_obj_utc = StripStats.date_object_utc_from_string(
                date_string, 'US/Eastern')
            row['datetime'] = datetime_obj_utc.strftime('%Y-%m-%d %H:%M:%S %z')

            # Media network
            row['network'] = matchup_row.find(
                'div', class_='tvNetwork').string.strip()

            # Load divs for each team into array
            td_teams = matchup_row.find('td', class_='teams')
            div_teams = td_teams.find_all('div', class_='pickem-teams')

            # Away team points
            row['away_team_points'] = div_teams[0].find(
                'div', class_='away').string.strip()
            # Home team points
            row['home_team_points'] = div_teams[1].find(
                'div', class_='home').string.strip()

            # Team codes and spread are all read from the button inside the
            # second team div.
            # NOTE(review): the away code comes from 'data-f' and the home
            # code from 'data-u' of the same button -- confirm that these
            # attributes really map to away/home.
            pick_button = div_teams[1].find('button')
            # Away team 3-letter code
            row['away_team'] = pick_button['data-f'].strip()
            # Home team 3-letter code
            row['home_team'] = pick_button['data-u'].strip()
            # Spread
            row['spread'] = pick_button['data-s'].strip()

            # Home team confidence %
            td_picked = matchup_row.find('td', class_='picked')
            divs = td_picked.find_all('div', class_='wpwOutsideWrapper')
            row['confidence'] = divs[1].find('span').string.strip()

            data.append(row)

        return data
# Main program execution
URL_TEMPLATE = 'http://games.espn.com/nfl-pigskin-pickem/{}/en/entry?entryID={}&period={}'
OUTPUT_FILE = 'espn_pickem_python.csv'
YEAR_ZERO = 2012 #year when first week corresponds to period=1
WEEKS_PER_SEASON = 17
# Names of the fields passed to EspnPickemStripStats for each matchup.
FIELDS = (
    'matchup_id',
    'game_id',
    'datetime',
    'network',
    'away_team',
    'home_team',
    'away_team_points',
    'home_team_points',
    'spread',
    'confidence',
)

# Truncate the output file before any data is written.
with open(OUTPUT_FILE, 'w'):
    pass

#START_YEAR = 2015
#ENTRY_IDS = [276612, 171981]
START_YEAR = 2016
ENTRY_IDS = [171981]

# One entry id per season, in order starting at START_YEAR.
for i, entry_id in enumerate(ENTRY_IDS):
    this_year = i + START_YEAR
    # Periods are numbered continuously from YEAR_ZERO, WEEKS_PER_SEASON per
    # season, so each season covers its own consecutive block of periods.
    start_period = (this_year - YEAR_ZERO) * WEEKS_PER_SEASON + 1
    end_period = start_period + WEEKS_PER_SEASON - 1
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Stripping ESPN Pick'Em data for " + str(this_year) + '...')
    for period in range(start_period, end_period + 1):
        url = URL_TEMPLATE.format(this_year, entry_id, period)
        espn_pickem_strip_stats = EspnPickemStripStats(url, OUTPUT_FILE, FIELDS)
        espn_pickem_strip_stats.write_data()
        print(url)