strip-stats/espn_pickem_strip_stats.py

"""Strip stats from ESPN Pick'Em"""
from pprint import pprint
import re
#from datetime import datetime
import httplib2
#from pytz import timezone
#import pytz
import sys
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

from StripStats import StripStats

class EspnPickemStripStats(StripStats):
    """Scrape matchup data from an ESPN Pick'Em entry page.

    Provides ``strip_data`` for the ``StripStats`` base class. How the base
    class invokes it is not visible from this file -- presumably through
    ``write_data``; confirm in ``StripStats``.
    """

    # Class variables

    # Needed to handle the case when a season starts and finishes in
    # different years and the calendar year of a game must be inferred:
    # months numerically before this one are assigned to the following year.
    season_start_month = 8

    # Patterns are compiled once for the class instead of once per table row.
    # Both accept http and https so a scheme change does not leave ``match``
    # as None.
    _GAME_ID_RE = re.compile(r'https?://.+\?gameId=(\d+)', re.IGNORECASE)
    _SEASON_YEAR_RE = re.compile(
        r'https?://games\.espn\.com/nfl-pigskin-pickem/(\d+)', re.IGNORECASE)

    @classmethod
    def _season_start_year(cls, html_doc):
        """Return the year embedded in the page's canonical link, as an int."""
        canonical_url = html_doc.find('link', rel='canonical')['href'].strip()
        match = cls._SEASON_YEAR_RE.match(canonical_url)
        return int(match.group(1))

    @staticmethod
    def _month_and_day(date_text):
        """Return ``(month, zero_padded_day)`` parsed from a date label.

        The label is either ``"<prefix>, <month> <day>"`` or just
        ``"<month> <day>"``; when a comma is present the part after the first
        comma is used.
        """
        date_parts = date_text.split(',')
        month_and_day = date_parts[1] if len(date_parts) > 1 else date_parts[0]
        # Whitespace-agnostic split, so doubled spaces cannot yield an empty day.
        pieces = month_and_day.split()
        return pieces[0], pieces[1].zfill(2)

    def strip_data(self, url):
        """Download ``url`` and return one dict per matchup row.

        Args:
            url: Address of the Pick'Em entry page to fetch.

        Returns:
            A list of dicts (empty if the table has no matchup rows), each
            with the keys ``matchup_id``, ``game_id``, ``datetime``,
            ``network``, ``away_team_points``, ``home_team_points``,
            ``away_team``, ``home_team``, ``spread`` and ``confidence``.
            All values are strings taken from the page; ``datetime`` is
            formatted as ``'%Y-%m-%d %H:%M:%S %z'``.

        No validation is performed: if an expected element or attribute is
        missing, the exception raised by the failed lookup propagates.
        """
        http = httplib2.Http()
        _, body = http.request(url)  # response headers are not used

        # Load HTML into an object
        html_doc = BeautifulSoup(body, 'html.parser')

        # Pick'Em table and its matchup rows
        pickem_table = html_doc.find('table', class_='pickemTable')
        matchup_rows = pickem_table.find_all('tr', class_='matchupRow')
        if not matchup_rows:
            # Nothing to extract; the canonical link is not consulted.
            return []

        # The season year comes from the page, not from a row, so it is
        # resolved once rather than per row.
        season_start_year = self._season_start_year(html_doc)

        data = []
        for matchup_row in matchup_rows:
            row = {}

            # Matchup id
            row['matchup_id'] = matchup_row['data-matchupid']

            # Game id, taken from the query string of the matchup link
            game_href = matchup_row.find('a', class_='matchupLink')['href']
            row['game_id'] = self._GAME_ID_RE.match(game_href).group(1)

            # Date (month and day)
            date_text = matchup_row.find(
                'div', class_='pickem-date').string.strip()
            month, day = self._month_and_day(date_text)

            # Year: games in months before the season start month fall in
            # the calendar year after the season's starting year.
            year = season_start_year
            month_int = StripStats.month_abbr_to_int[month]
            if month_int < EspnPickemStripStats.season_start_month:
                year += 1

            # Time
            kickoff_time = matchup_row.find(
                'div', class_='pickem-time').string.strip()
            # Slice instead of index so a very short string cannot raise
            # IndexError here.
            if kickoff_time[1:2] == ':':
                kickoff_time = '0' + kickoff_time  # pad with zero if necessary
            date_string = ' '.join((month, day, str(year), kickoff_time))

            # NOTE(review): presumably the helper interprets the string as
            # US/Eastern and returns a UTC datetime -- confirm in StripStats.
            datetime_obj_utc = StripStats.date_object_utc_from_string(
                date_string, 'US/Eastern')
            row['datetime'] = datetime_obj_utc.strftime('%Y-%m-%d %H:%M:%S %z')

            # Media network
            row['network'] = matchup_row.find(
                'div', class_='tvNetwork').string.strip()

            # Load divs for each team into array
            td_teams = matchup_row.find('td', class_='teams')
            div_teams = td_teams.find_all('div', class_='pickem-teams')

            # Away team points (first team div) and home team points (second)
            row['away_team_points'] = div_teams[0].find(
                'div', class_='away').string.strip()
            row['home_team_points'] = div_teams[1].find(
                'div', class_='home').string.strip()

            # The first button in the second team div carries the team codes
            # and the spread as data attributes; look it up once.
            button = div_teams[1].find('button')
            row['away_team'] = button['data-f'].strip()  # away 3-letter code
            row['home_team'] = button['data-u'].strip()  # home 3-letter code
            row['spread'] = button['data-s'].strip()

            # Home team confidence %
            td_picked = matchup_row.find('td', class_='picked')
            wrappers = td_picked.find_all('div', class_='wpwOutsideWrapper')
            row['confidence'] = wrappers[1].find('span').string.strip()

            data.append(row)

        return data

# Main program execution
URL_TEMPLATE = 'http://games.espn.com/nfl-pigskin-pickem/{}/en/entry?entryID={}&period={}'
OUTPUT_FILE = 'espn_pickem_python.csv'
YEAR_ZERO = 2012 #year when first week corresponds to period=1
WEEKS_PER_SEASON = 17
# Names of the row fields handed to the stats writer.
FIELDS = (
    'matchup_id',
    'game_id',
    'datetime',
    'network',
    'away_team',
    'home_team',
    'away_team_points',
    'home_team_points',
    'spread',
    'confidence',
)

# Truncate the output file; the context manager closes the handle even if
# opening succeeds but something fails afterwards.
with open(OUTPUT_FILE, 'w'):
    pass

# ENTRY_IDS[k] is the entry scraped for season START_YEAR + k. For example,
# START_YEAR = 2015 with ENTRY_IDS = [276612, 171981] covers 2015 and 2016.
START_YEAR = 2016
ENTRY_IDS = [171981]
for this_year, entry_id in enumerate(ENTRY_IDS, START_YEAR):
    # Period numbers keep counting up across seasons, starting at 1 in the
    # first week of YEAR_ZERO.
    start_period = (this_year - YEAR_ZERO) * WEEKS_PER_SEASON + 1
    end_period = start_period + WEEKS_PER_SEASON - 1
    # Parenthesised single-argument print produces the same output on
    # Python 2 and Python 3.
    print("Stripping ESPN Pick'Em data for " + str(this_year) + '...')
    for period in range(start_period, end_period + 1):
        url = URL_TEMPLATE.format(this_year, entry_id, period)
        espn_pickem_strip_stats = EspnPickemStripStats(url, OUTPUT_FILE, FIELDS)
        espn_pickem_strip_stats.write_data()
        print(url)