strip-stats/espn_pickem_strip_stats.py
2017-08-25 16:16:33 -05:00

147 lines
5.1 KiB
Python

"""Strip stats from ESPN Pick'Em"""
from pprint import pprint
import re
from datetime import datetime
import httplib2
from pytz import timezone
import pytz
import sys
try:
from BeautifulSoup import BeautifulSoup
except ImportError:
from bs4 import BeautifulSoup
from StripStats import StripStats
class EspnPickemStripStats(StripStats):
    """Scrape matchup data from an ESPN Pick'Em entry page.

    Provides the ``strip_data`` implementation for the ``StripStats`` base
    class (the original author describes it as overriding an abstract
    method of that class).
    """

    # Compiled once when the class is created rather than once per row.
    _GAME_ID_RE = re.compile(r'http://.+\?gameId=([\d]+)', re.IGNORECASE)
    _SEASON_YEAR_RE = re.compile(
        r'http://games.espn.com/nfl-pigskin-pickem/([\d]+)', re.IGNORECASE)

    def strip_data(self, url):
        """Download a Pick'Em entry page and return one dict per matchup.

        Args:
            url: Address of the Pick'Em entry page to fetch.

        Returns:
            A list of dicts, one for each ``tr.matchupRow`` found inside the
            ``table.pickemTable`` element, in page order. Each dict has the
            keys ``matchup_id``, ``game_id``, ``datetime``, ``network``,
            ``away_team_points``, ``home_team_points``, ``away_team``,
            ``home_team``, ``spread`` and ``confidence``; every value is a
            string. The list is empty when the table has no matchup rows.

        Raises:
            The page is not validated. A missing element, attribute or
            regex match surfaces as the underlying ``AttributeError``,
            ``TypeError``, ``KeyError`` or ``IndexError``; a date/time that
            does not fit the expected format raises ``ValueError`` from
            ``datetime.strptime``.
        """
        http = httplib2.Http()
        _response, body = http.request(url)
        # Load HTML into an object
        html_doc = BeautifulSoup(body, 'html.parser')

        pickem_table = html_doc.find('table', class_='pickemTable')
        matchup_rows = pickem_table.find_all('tr', class_='matchupRow')
        if not matchup_rows:
            # Nothing to parse, so the season year is never needed.
            return []

        # The year comes from the page itself and is identical for every
        # row, so it is resolved once here instead of inside the loop.
        year = self._season_year(html_doc)
        return [self._parse_matchup(matchup_row, year)
                for matchup_row in matchup_rows]

    @classmethod
    def _season_year(cls, html_doc):
        """Return the year captured from the page's canonical link href."""
        canonical_url = html_doc.find('link', rel='canonical')['href'].strip()
        return cls._SEASON_YEAR_RE.match(canonical_url).group(1)

    @classmethod
    def _parse_matchup(cls, matchup_row, year):
        """Build the output dict for a single ``tr.matchupRow`` element."""
        game_href = matchup_row.find('a', class_='matchupLink')['href']

        # One div per team inside the "teams" cell.
        teams_cell = matchup_row.find('td', class_='teams')
        team_divs = teams_cell.find_all('div', class_='pickem-teams')
        # NOTE(review): both team codes and the spread are read from the
        # button inside the *second* team div (data-f / data-u / data-s),
        # exactly as the original code did -- confirm this is intended.
        pick_button = team_divs[1].find('button')

        picked_cell = matchup_row.find('td', class_='picked')
        confidence_wrappers = picked_cell.find_all(
            'div', class_='wpwOutsideWrapper')

        return {
            'matchup_id': matchup_row['data-matchupid'],
            'game_id': cls._GAME_ID_RE.match(game_href).group(1),
            'datetime': cls._kickoff_utc(matchup_row, year),
            # Media network
            'network': matchup_row.find(
                'div', class_='tvNetwork').string.strip(),
            'away_team_points': team_divs[0].find(
                'div', class_='away').string.strip(),
            'home_team_points': team_divs[1].find(
                'div', class_='home').string.strip(),
            # 3-letter team codes
            'away_team': pick_button['data-f'].strip(),
            'home_team': pick_button['data-u'].strip(),
            'spread': pick_button['data-s'].strip(),
            # Home team confidence % (second wrapper in the "picked" cell)
            'confidence': confidence_wrappers[1].find('span').string.strip(),
        }

    @staticmethod
    def _kickoff_utc(matchup_row, year):
        """Return the row's kickoff as a ``'%Y-%m-%d %H:%M:%S %z'`` UTC string.

        The date and time shown in the row are combined with ``year``,
        interpreted as US/Eastern wall-clock time and converted to UTC.
        """
        date_text = matchup_row.find(
            'div', class_='pickem-date').string.strip()
        # When the text contains a comma, the month and day are taken from
        # the part after the first comma; otherwise from the whole text.
        date_parts = date_text.split(',')
        if len(date_parts) > 1:
            month_and_day = date_parts[1].strip()
        else:
            month_and_day = date_parts[0].strip()
        month_and_day_parts = month_and_day.split(' ')
        month = month_and_day_parts[0].strip()
        day = month_and_day_parts[1].strip()
        if len(day) == 1:
            day = '0' + day  # pad with zero if necessary

        kickoff = matchup_row.find(
            'div', class_='pickem-time').string.strip()
        if kickoff[1] == ':':
            kickoff = '0' + kickoff  # pad with zero if necessary

        date_string = ' '.join((month, day, str(year), kickoff))
        naive = datetime.strptime(date_string, '%b %d %Y %I:%M%p')
        eastern = timezone('US/Eastern').localize(naive)
        return eastern.astimezone(pytz.UTC).strftime('%Y-%m-%d %H:%M:%S %z')
# Main program execution
#
# Scrapes every weekly period of each configured season and writes the rows
# to OUTPUT_FILE through EspnPickemStripStats.write_data().
URL_TEMPLATE = 'http://games.espn.com/nfl-pigskin-pickem/{}/en/entry?entryID={}&period={}'
OUTPUT_FILE = 'espn_pickem_python.csv'
YEAR_ZERO = 2012  # year when first week corresponds to period=1
WEEKS_PER_SEASON = 17
# Names of the row fields handed to the scraper (they match the keys
# produced by EspnPickemStripStats.strip_data).
FIELDS = (
    'matchup_id',
    'game_id',
    'datetime',
    'network',
    'away_team',
    'home_team',
    'away_team_points',
    'home_team_points',
    'spread',
    'confidence',
)

# Truncate the output file before scraping starts.
# NOTE(review): presumably write_data() appends to the file -- confirm
# against StripStats.
with open(OUTPUT_FILE, 'w'):
    pass

# 2015 and 2016: one entry id per season, in order starting at START_YEAR.
START_YEAR = 2015
ENTRY_IDS = (276612, 171981)

for this_year, entry_id in enumerate(ENTRY_IDS, START_YEAR):
    # Period numbers keep counting up across seasons from YEAR_ZERO, so a
    # season's first week is offset by the weeks of all earlier seasons.
    start_period = (this_year - YEAR_ZERO) * WEEKS_PER_SEASON + 1
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Stripping ESPN Pick'Em data for " + str(this_year) + '...')
    for period in range(start_period, start_period + WEEKS_PER_SEASON):
        url = URL_TEMPLATE.format(this_year, entry_id, period)
        espn_pickem_strip_stats = EspnPickemStripStats(url, OUTPUT_FILE, FIELDS)
        espn_pickem_strip_stats.write_data()
        print(url)