157 lines
5.5 KiB
Python
157 lines
5.5 KiB
Python
"""Strip stats from ESPN Pick'Em"""
|
|
from pprint import pprint
|
|
import re
|
|
#from datetime import datetime
|
|
import httplib2
|
|
#from pytz import timezone
|
|
#import pytz
|
|
import sys
|
|
try:
|
|
from BeautifulSoup import BeautifulSoup
|
|
except ImportError:
|
|
from bs4 import BeautifulSoup
|
|
|
|
from StripStats import StripStats
|
|
|
|
class EspnPickemStripStats(StripStats):
    """Scrape matchup data from an ESPN Pick'Em entry page.

    Provides the ``strip_data`` implementation for the ``StripStats`` base
    class (presumably invoked by the base class when writing output --
    confirm against StripStats).
    """

    # First month of a season.  A season starts and finishes in different
    # calendar years and the page only supplies the season's starting year,
    # so a game in a month earlier than this one is placed in the next year.
    season_start_month = 8

    # Compiled once rather than once per matchup row.  Both http and https
    # links are accepted.
    _GAME_ID_RE = re.compile(r'https?://.+\?gameId=([\d]+)', re.IGNORECASE)
    _SEASON_YEAR_RE = re.compile(
        r'https?://games\.espn\.com/nfl-pigskin-pickem/([\d]+)',
        re.IGNORECASE)

    def strip_data(self, url):
        """Download *url* and return one dict per matchup row on the page.

        Args:
            url: Address of the Pick'Em entry page to fetch.

        Returns:
            A list of dicts in page order, one for each ``tr.matchupRow``
            inside ``table.pickemTable``.  Each dict has the keys
            ``matchup_id``, ``game_id``, ``datetime``, ``network``,
            ``away_team_points``, ``home_team_points``, ``away_team``,
            ``home_team``, ``spread`` and ``confidence``.  ``datetime`` is
            formatted as ``'%Y-%m-%d %H:%M:%S %z'``; the other values are
            text taken from the page.

        Raises:
            AttributeError, TypeError, KeyError, IndexError: Nothing is
                caught here, so markup that does not have the expected
                elements or attributes surfaces as one of these.
        """
        http = httplib2.Http()
        _headers, body = http.request(url)

        # Load HTML into an object
        html_doc = BeautifulSoup(body, 'html.parser')

        data = []

        # Pick'Em table and its matchup rows
        pickem_table = html_doc.find('table', class_='pickemTable')
        matchup_rows = pickem_table.find_all('tr', class_='matchupRow')
        if not matchup_rows:
            # The canonical link below is only needed (and only required to
            # exist) when there is at least one row to date.
            return data

        # Season start year, taken from the page's canonical URL.  It is the
        # same for every row, so it is resolved once up front.
        canonical_url = html_doc.find('link', rel='canonical')['href'].strip()
        season_start_year = int(
            self._SEASON_YEAR_RE.match(canonical_url).group(1))

        for matchup_row in matchup_rows:
            row = {}

            # Matchup id
            row['matchup_id'] = matchup_row['data-matchupid']

            # Game id, from the gameId query parameter of the matchup link
            game_href = matchup_row.find('a', class_='matchupLink')['href']
            row['game_id'] = self._GAME_ID_RE.match(game_href).group(1)

            # Date (month and day).  When the text contains a comma, the
            # "<month> <day>" part is the segment after the first comma.
            date_text = matchup_row.find(
                'div', class_='pickem-date').string.strip()
            date_parts = date_text.split(',')
            if len(date_parts) > 1:
                month_and_day = date_parts[1].strip()
            else:
                month_and_day = date_parts[0].strip()
            month_and_day_parts = month_and_day.split(' ')
            month = month_and_day_parts[0].strip()
            day = month_and_day_parts[1].strip()
            if len(day) == 1:
                day = '0' + day  # pad with zero if necessary

            # Year: months before the season start belong to the next year
            year = season_start_year
            month_int = StripStats.month_abbr_to_int[month]
            if month_int < EspnPickemStripStats.season_start_month:
                year += 1

            # Time
            time_text = matchup_row.find(
                'div', class_='pickem-time').string.strip()
            if time_text[1] == ':':
                time_text = '0' + time_text  # pad with zero if necessary

            date_string = ' '.join([month, day, str(year), time_text])
            datetime_obj_utc = StripStats.date_object_utc_from_string(
                date_string, 'US/Eastern')
            row['datetime'] = datetime_obj_utc.strftime('%Y-%m-%d %H:%M:%S %z')

            # Media network
            row['network'] = matchup_row.find(
                'div', class_='tvNetwork').string.strip()

            # Load divs for each team into array
            td_teams = matchup_row.find('td', class_='teams')
            div_teams = td_teams.find_all('div', class_='pickem-teams')

            # Away team points
            row['away_team_points'] = div_teams[0].find(
                'div', class_='away').string.strip()

            # Home team points
            row['home_team_points'] = div_teams[1].find(
                'div', class_='home').string.strip()

            # The button in the second team div carries the remaining fields
            # as data-* attributes.
            button = div_teams[1].find('button')

            # Away team 3-letter code
            row['away_team'] = button['data-f'].strip()

            # Home team 3-letter code
            row['home_team'] = button['data-u'].strip()

            # Spread
            row['spread'] = button['data-s'].strip()

            # Home team confidence %
            td_picked = matchup_row.find('td', class_='picked')
            divs = td_picked.find_all('div', class_='wpwOutsideWrapper')
            row['confidence'] = divs[1].find('span').string.strip()

            data.append(row)

        return data
|
|
|
|
# Main program execution
#
# Truncates OUTPUT_FILE, then for each entry id builds the URL of every
# weekly period of that entry's season, scrapes it with EspnPickemStripStats
# and calls write_data() on the scraper.
URL_TEMPLATE = 'http://games.espn.com/nfl-pigskin-pickem/{}/en/entry?entryID={}&period={}'
OUTPUT_FILE = 'espn_pickem_python.csv'
YEAR_ZERO = 2012  # year when first week corresponds to period=1
WEEKS_PER_SEASON = 17
FIELDS = (
    'matchup_id',
    'game_id',
    'datetime',
    'network',
    'away_team',
    'home_team',
    'away_team_points',
    'home_team_points',
    'spread',
    'confidence',
)

open(OUTPUT_FILE, 'w').close()  # truncate file

# ENTRY_IDS[i] is the entry scraped for season START_YEAR + i.
# Alternative configuration covering two seasons:
#   START_YEAR = 2015
#   ENTRY_IDS = [276612, 171981]
START_YEAR = 2016
ENTRY_IDS = [171981]

for i, entry_id in enumerate(ENTRY_IDS):
    this_year = i + START_YEAR

    # Period numbers keep counting across seasons starting from YEAR_ZERO.
    start_period = (this_year - YEAR_ZERO) * WEEKS_PER_SEASON + 1
    end_period = start_period + WEEKS_PER_SEASON - 1

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Stripping ESPN Pick'Em data for " + str(this_year) + '...')
    for period in range(start_period, end_period + 1):
        url = URL_TEMPLATE.format(this_year, entry_id, period)
        espn_pickem_strip_stats = EspnPickemStripStats(url, OUTPUT_FILE, FIELDS)
        espn_pickem_strip_stats.write_data()
        print(url)
|