#!/usr/local/bin/python

# Development
#import sys
#from pprint import pprint

import csv
import re
from datetime import datetime

import httplib2
from pytz import timezone

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Legacy fallback; the rest of the script assumes the bs4 API
    # (find_all, class_, 'html.parser').
    from BeautifulSoup import BeautifulSoup

GAMES_PER_SEASON = 17  # weekly scoring periods in the regular season
YEAR = 2016
FIRST_GAME_ID = 69  # value of the 'period' URL parameter for week 1
URL_TEMPLATE = 'http://games.espn.com/nfl-pigskin-pickem/2016/en/entry?entryID=171981&period={}'
FIELDS = (
    'matchup_id',
    'game_id',
    'datetime',
    'network',
    'away_team',
    'away_team_points',
    'home_team',
    'home_team_points',
    'spread',
    'confidence',
)

# Output csv file
CSV_FILE = 'espn_pickem.csv'


def create_or_truncate_file(path):
    open(path, 'w').close()


def is_file_empty(path):
    try:
        with open(path, 'r') as f:
            for line in f:
                if line.strip():
                    return False  # found a non-blank line, so the file has content
    except (IOError, OSError):
        pass
    return True


def dict_writer(csvfile):
    return csv.DictWriter(csvfile, FIELDS, extrasaction='raise', dialect='excel')


create_or_truncate_file(CSV_FILE)

with open(CSV_FILE, 'wb') as csvfile:  # binary mode for the csv module under Python 2
    mywriter = dict_writer(csvfile)

    if is_file_empty(CSV_FILE):
        mywriter.writeheader()

    for i in range(0, GAMES_PER_SEASON):
        # Load html for this week's page
        url = URL_TEMPLATE.format(FIRST_GAME_ID + i)
        http = httplib2.Http()
        headers, body = http.request(url)

        # Load HTML into an object
        soup = BeautifulSoup(body, 'html.parser')

        # Pick'Em table
        pickem_table = soup.find('table', class_='pickemTable')

        # List of matchup rows
        matchup_rows = pickem_table.find_all('tr', class_='matchupRow')

        # Loop through rows
        for j in range(0, len(matchup_rows)):
            matchup_row = matchup_rows[j]

            # Dictionary for data
            data = {}

            # Matchup id
            data['matchup_id'] = matchup_row['data-matchupid']

            # Game id
            game_href = matchup_row.find('a', class_='matchupLink')['href']
            p = re.compile(r'http://.+\?gameId=([\d]+)', re.IGNORECASE)
            m = p.match(game_href)
            data['game_id'] = m.group(1)

            # Date, e.g. "Thu, Sep 8"
            date = matchup_row.find('div', class_='pickem-date').string.strip()
            date_parts = date.split(',')
            if len(date_parts) > 1:
                month_and_day = date_parts[1].strip()
            else:
                month_and_day = date_parts[0].strip()
            month_and_day_parts = month_and_day.split(' ')
            month = month_and_day_parts[0].strip()
            day = month_and_day_parts[1].strip()
            if len(day) == 1:
                day = '0' + day  # pad with zero if necessary

            # Time, e.g. "8:30PM"
            time = matchup_row.find('div', class_='pickem-time').string.strip()
            if time[1] == ':':
                time = '0' + time  # pad with zero if necessary

            # %I (12-hour clock) is required for %p to take effect
            datetime_obj_naive = datetime.strptime(
                month + ' ' + day + ' ' + str(YEAR) + ' ' + time,
                '%b %d %Y %I:%M%p')
            datetime_obj_eastern = timezone('US/Eastern').localize(datetime_obj_naive)
            data['datetime'] = datetime_obj_eastern.strftime('%Y-%m-%d %H:%M:%S %Z%z')

            # Media network
            data['network'] = matchup_row.find('div', class_='tvNetwork').string.strip()

            # Load divs for each team into a list
            td_teams = matchup_row.find('td', class_='teams')
            div_teams = td_teams.find_all('div', class_='pickem-teams')

            # Away team points
            data['away_team_points'] = div_teams[0].find('div', class_='away').string.strip()

            # Home team points
            data['home_team_points'] = div_teams[1].find('div', class_='home').string.strip()

            # Away team 3-letter code
            data['away_team'] = div_teams[1].find('button')['data-f'].strip()

            # Home team 3-letter code
            data['home_team'] = div_teams[1].find('button')['data-u'].strip()

            # Spread
            data['spread'] = div_teams[1].find('button')['data-s'].strip()

            # Home team confidence %
            td_picked = matchup_row.find('td', class_='picked')
            divs = td_picked.find_all('div', class_='wpwOutsideWrapper')
            data['confidence'] = divs[1].find('span').string.strip()

            # Insert row of data into csv file
            mywriter.writerow(data)
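
# A minimal sanity check (optional sketch, not required by the scraper):
# read the finished csv back with csv.DictReader and print one line per
# scraped matchup. It only assumes the CSV_FILE and FIELDS defined above;
# the header row written by writeheader() supplies the column names.
with open(CSV_FILE, 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row['datetime'] + '  ' + row['away_team'] + ' @ ' + row['home_team']
              + '  spread ' + row['spread'] + '  confidence ' + row['confidence'])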