Can someone explain this code to me please

python

#1

The code works and I know what it's supposed to do, but I don't know exactly what all of this code does, or what some of it means. For example, “unfussy reader”: what does that do, or what does it mean?

import numpy as np
import csv
import os
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Accesses each column in the array

class BattingData:
    """Column positions of the fields in one batting-data row.

    Every attribute is a plain integer used as a column index, e.g.
    ``data[:, BattingData.runs]`` or ``row[BattingData.year]``.
    """

    year = 0
    player_id = 1
    first_name = 2
    last_name = 3
    team_name = 4
    games = 5
    at_bats = 6
    runs = 7
    hits = 8
    doubles = 9
    triples = 10
    home_runs = 11
    rbi = 12
    walks = 13
    hbp = 14
    stolen_bases = 15
    caught_stealing = 16
    strike_outs = 17
    sac_flies = 18
    position = 19

def unfussy_reader(reader):
    """Yield rows from ``reader``, skipping any row that fails to parse.

    "Unfussy" means the wrapper does not complain about bad input: when
    ``next(reader)`` raises ``csv.Error`` the offending row is dropped and
    reading continues with the following row.

    Args:
        reader: An iterator of rows, typically a ``csv.reader`` object.

    Yields:
        Each row that ``reader`` produces without raising ``csv.Error``.
    """
    while True:
        try:
            row = next(reader)
        except csv.Error:
            # Malformed row: ignore it and move on to the next one.
            continue
        except StopIteration:
            # Input exhausted. Return explicitly: a StopIteration that escapes
            # a generator body is turned into RuntimeError on Python 3.7+.
            return
        yield row

# 2b: Provides a path to my folder called "data".

# Reads the batting data.

def read_data(data_path=None):
    """Read every CSV file in the data folder into one array of rows.

    The first successfully parsed row of each file is treated as a header and
    skipped. Rows that the csv module cannot parse are dropped by
    ``unfussy_reader``; undecodable bytes are ignored (``errors='ignore'``).

    Args:
        data_path: Folder containing the CSV files. Defaults to the ``data``
            folder inside the current working directory.

    Returns:
        ``np.array`` built from the list of data rows of all files.
    """
    if data_path is None:
        # os.getcwd() returns the current working directory of the process.
        data_path = os.path.join(os.getcwd(), "data")

    contents = []
    # Sorted so the row order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(data_path)):
        file_path = os.path.join(data_path, filename)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be opened as CSV files.
            continue
        with open(file_path, 'r', newline='', errors='ignore', encoding='utf-8') as csvfile:
            reader = unfussy_reader(csv.reader(csvfile))
            for row_num, row in enumerate(reader):
                # Row 0 is the header; blank lines come back as empty lists
                # and would make the resulting array ragged.
                if row_num != 0 and row:
                    contents.append(row)
    return np.array(contents)

# Creates a histogram displaying the number of runs scored in the lifetimes of all players.

def histogram_no_positions(data):
    """Show a histogram of total runs per player, ignoring position.

    Args:
        data: Array of batting rows indexed by the ``BattingData`` columns.
    """
    print('Histogram of runs scored in the lifetimes of all players (no cutoff, no positions)')
    # Sum the runs of every row that belongs to the same player id.
    runs = {}
    for item in data:
        player = item[BattingData.player_id]
        runs[player] = runs.get(player, 0) + int(item[BattingData.runs])
    plt.hist(list(runs.values()), bins=100)
    plt.xlabel('Number of total runs')
    plt.ylabel('Number of players')
    plt.title('Total runs vs number of players having that many runs')
    plt.show()

def lifetime_runs_by_position(data, cutoff=0):
    """Total the runs of each player, grouped by fielding position.

    Args:
        data: Iterable of batting rows indexed by the ``BattingData`` columns.
        cutoff: Totals below this value are left out of the result.

    Returns:
        A ``(keys, values)`` tuple: ``keys`` is the sorted list of positions
        and ``values[i]`` is the list of per-player run totals (each
        ``>= cutoff``) for position ``keys[i]``.
    """
    # runs[position][player_id] -> total runs of that player at that position
    runs = {}
    for item in data:
        by_player = runs.setdefault(item[BattingData.position], {})
        player = item[BattingData.player_id]
        by_player[player] = by_player.get(player, 0) + int(item[BattingData.runs])

    keys = sorted(runs)
    values = [
        [total for total in runs[key].values() if total >= cutoff]
        for key in keys
    ]
    return (keys, values)

def histogram_no_cutoff(data):
    """Show a stacked histogram of total runs per player, one series per position.

    Args:
        data: Array of batting rows indexed by the ``BattingData`` columns.
    """
    print('Histogram of runs scored in the lifetimes of all players by positions (no cutoff)')
    # A cutoff of 0 keeps every player.
    keys, values = lifetime_runs_by_position(data, 0)
    plt.hist(values, bins=100, label=keys, stacked=True)
    plt.legend(prop={'size': 10})
    plt.xlabel('Number of total runs by position')
    plt.ylabel('Number of players')
    plt.title('Total runs vs number of players having that many runs by position')
    plt.show()

def histogram_cutoff_100(data):
    """Show the stacked per-position runs histogram for players with 100+ runs.

    Args:
        data: Array of batting rows indexed by the ``BattingData`` columns.
    """
    print("Histogram of runs scored in the lifetimes of all players (cutoff = 100)")
    # Totals below 100 are dropped by lifetime_runs_by_position.
    keys, values = lifetime_runs_by_position(data, 100)
    plt.hist(values, bins=100, label=keys, stacked=True)
    plt.legend(prop={'size': 10})
    plt.xlabel('Number of total runs by position')
    plt.ylabel('Number of players')
    plt.title('Total runs vs number of players having that many runs by position (cutoff=100)')
    plt.show()

def team_presence(data):
    """Show a scatter plot marking the years in which each team has rows.

    Teams are placed on the y axis in sorted name order; one square marker is
    drawn for every distinct year in which the team appears in ``data``.

    Args:
        data: Iterable of batting rows indexed by the ``BattingData`` columns.
    """
    print("Graph team presence over time")
    # years[team_name] -> every year value seen for that team (with repeats)
    years = {}
    for item in data:
        years.setdefault(item[BattingData.team_name], []).append(
            int(item[BattingData.year]))

    teams = sorted(years)
    for i, team in enumerate(teams):
        x = np.unique(years[team])  # distinct years, ascending
        plt.scatter(x, [i] * len(x), s=6, marker='s')
    plt.yticks(range(len(teams)), teams, fontproperties=FontProperties(size=5))
    plt.xticks(fontproperties=FontProperties(size=7))
    plt.xlabel('Year')
    plt.ylabel('Team names')
    plt.title('Team presence by year')
    plt.show()

def best_worst_stealing_bases(data):
    """Print the rows with the best and the worst base-stealing record.

    Only rows whose games count is above the 25th percentile are considered.
    "Best" is the row with the highest stolen-bases-per-game ratio; "worst" is
    the row with the largest ``caught_stealing - stolen_bases`` difference.

    Args:
        data: Array of batting rows indexed by the ``BattingData`` columns.
    """
    print("The batters that had the best and worst stealing bases")
    if len(data) == 0:
        print("No batting data available.")
        return

    all_games = data[:, BattingData.games].astype(np.int32)
    keep = all_games > np.percentile(all_games, 25)
    players = data[keep]
    games = all_games[keep]
    if len(players) == 0:
        # Happens when every row has the same games count; np.argmax would
        # raise on an empty array.
        print("No batting data available.")
        return

    bases_stolen = players[:, BattingData.stolen_bases].astype(np.int32)
    caught_stealing = players[:, BattingData.caught_stealing].astype(np.int32)

    steal_ratio = bases_stolen / games
    steal_difference = caught_stealing - bases_stolen
    best = np.argmax(steal_ratio)
    worst = np.argmax(steal_difference)

    print("Best Stealer:")
    print("{} {} ({}) : games played - {}, bases stolen - {}".format(
        players[best][BattingData.first_name],
        players[best][BattingData.last_name],
        players[best][BattingData.year],
        games[best],
        bases_stolen[best]))

    print("\nWorst Stealer:")
    print("{} {} ({}) : games played - {}, bases stolen - {}, bases caught stealing - {}".format(
        players[worst][BattingData.first_name],
        players[worst][BattingData.last_name],
        players[worst][BattingData.year],
        games[worst],
        bases_stolen[worst],
        caught_stealing[worst]))

def get_n_best(data, n):
    """Return the indices of the ``n`` largest values in ``data``, best first.

    Args:
        data: One-dimensional sequence of comparable numbers.
        n: How many indices to return. Values larger than ``len(data)`` are
            clamped; zero or negative values give an empty result.

    Returns:
        Integer ``np.ndarray`` of indices into ``data``, ordered from the
        largest value to the smallest.
    """
    values = np.asarray(data)
    n = min(max(int(n), 0), values.size)
    if n == 0:
        # Without this guard ``[-0:]`` would select every index.
        return np.array([], dtype=np.intp)
    # argpartition finds the top n in linear time but leaves them unordered.
    top = np.argpartition(values, -n)[-n:]
    return top[np.argsort(values[top])[::-1]]

def calculate_on_base_percentage(data):
    """Compute the on-base percentage of every player season.

    Steps:
      1. Keep one row per (player id, year, team name) combination.
      2. Keep only rows whose at-bats AND games are above their respective
         25th percentiles.
      3. Sum the remaining rows per player id + year.
      4. on_base_percentage = (hits + walks + hbp)
                              / (at_bats + walks + hbp + sac_flies)

    Args:
        data: Array of batting rows indexed by the ``BattingData`` columns.

    Returns:
        Dict keyed by ``player_id + year`` (string concatenation). Each value
        is a dict with keys ``info`` (the first four columns of the row),
        ``games``, ``hits``, ``walks``, ``hbp``, ``at_bats``, ``sac_flies``
        and ``on_base_percentage``.
    """
    if len(data) == 0:
        return {}

    # Build a "player_id + year + team_name" key per row; np.unique with
    # return_index gives the first row index of each distinct key.
    season_keys = np.char.add(
        np.char.add(data[:, BattingData.player_id], data[:, BattingData.year]),
        data[:, BattingData.team_name])
    data = data[np.unique(season_keys, return_index=True)[1]]

    at_bats = data[:, BattingData.at_bats].astype(np.int32)
    games = data[:, BattingData.games].astype(np.int32)
    at_bats_25th = np.percentile(at_bats, 25)
    games_25th = np.percentile(games, 25)
    data = data[(at_bats > at_bats_25th) & (games > games_25th)]

    players = {}
    for player in data:
        season_id = player[BattingData.player_id] + player[BattingData.year]
        if season_id not in players:
            players[season_id] = {
                "info": player[0:4],
                "games": 0,
                "hits": 0,
                "walks": 0,
                "hbp": 0,
                "at_bats": 0,
                "sac_flies": 0,
            }
        totals = players[season_id]
        totals['games'] += int(player[BattingData.games])
        totals['hits'] += int(player[BattingData.hits])
        totals['walks'] += int(player[BattingData.walks])
        totals['hbp'] += int(player[BattingData.hbp])
        totals['at_bats'] += int(player[BattingData.at_bats])
        try:
            totals['sac_flies'] += int(player[BattingData.sac_flies])
        except ValueError:
            # A non-numeric (e.g. blank) sacrifice-flies field counts as zero.
            pass

    for totals in players.values():
        reached_base = totals['hits'] + totals['walks'] + totals['hbp']
        opportunities = (totals['at_bats'] + totals['walks']
                         + totals['hbp'] + totals['sac_flies'])
        # Guard the division; a season with no opportunities rates 0.
        totals['on_base_percentage'] = (
            reached_base / opportunities if opportunities else 0.0)

    return players

def best_n_season_by_onbase_percentage(players, n):
    """Return the ``n`` season records with the highest on-base percentage.

    Args:
        players: Dict of season records; each record has an
            ``'on_base_percentage'`` entry.
        n: Number of seasons wanted. Clamped to the number of seasons.

    Returns:
        List of season records, highest on-base percentage first.
    """
    keys = list(players)
    n = min(n, len(keys))
    if n <= 0:
        return []
    indices = get_n_best([players[key]['on_base_percentage'] for key in keys], n)
    best_n_players = [players[keys[index]] for index in indices]
    # Sort here as well so the ranking never depends on the order in which
    # get_n_best returns the indices.
    best_n_players.sort(
        key=lambda season: season['on_base_percentage'], reverse=True)
    return best_n_players

def best_20_seaons_by_onbase_percentage(data):
    """Print the 20 player seasons with the highest on-base percentage.

    Each line shows first name, last name, year and the percentage to four
    decimal places.

    Args:
        data: Array of batting rows indexed by the ``BattingData`` columns.
    """
    # The heading used to say "10" although 20 seasons are listed.
    print('20 best seasons that batters had by on base percentage')
    players = calculate_on_base_percentage(data)
    for season in best_n_season_by_onbase_percentage(players, 20):
        info = season["info"]
        print('{} {} ({}): {:.4f}'.format(
            info[BattingData.first_name],
            info[BattingData.last_name],
            info[BattingData.year],
            season['on_base_percentage']))

def plot_best_seasons_onbase_percentage(data):
    """Plot on-base percentage over time for players who had a top-20 season.

    One line is drawn per player, covering every season of that player that
    ``calculate_on_base_percentage`` returns.

    Args:
        data: Array of batting rows indexed by the ``BattingData`` columns.
    """
    print('On base percentage for the lifetimes of players with best seasons')
    players = calculate_on_base_percentage(data)
    best_20 = best_n_season_by_onbase_percentage(players, 20)
    # Derive the players from the data; this used to be overwritten by a
    # hard-coded list of ids, which only suited one particular data set.
    ids = {season["info"][BattingData.player_id] for season in best_20}

    for player_id in sorted(ids):
        seasons = [season for season in players.values()
                   if season["info"][BattingData.player_id] == player_id]
        # Numeric years give a real time axis instead of string categories.
        seasons.sort(key=lambda season: int(season["info"][BattingData.year]))
        x = [int(season["info"][BattingData.year]) for season in seasons]
        y = [season["on_base_percentage"] for season in seasons]
        info = seasons[-1]["info"]
        label = '{} {}'.format(
            info[BattingData.first_name], info[BattingData.last_name])
        plt.plot(x, y, label=label)

    plt.xlabel("Year")
    plt.ylabel("On base percentage")
    plt.title("Time vs on base percentage of players with best seasons")
    plt.legend(prop={'size': 9}, bbox_to_anchor=(1, 1))
    plt.show()

def homeruns_percentiles(data):
    """Show, per year, the maximum, 99th and 50th percentile of home runs.

    The three bar series are drawn over each other in that order, so the
    smaller values stay visible in front of the larger ones.

    Args:
        data: Iterable of batting rows indexed by the ``BattingData`` columns.
    """
    print('Homeruns over time (percentiles)')
    # years[year] -> home-run counts of every row of that year
    years = {}
    for player in data:
        year = int(player[BattingData.year])
        years.setdefault(year, []).append(int(player[BattingData.home_runs]))

    x = sorted(years)
    y_max = [np.max(years[year]) for year in x]
    y_99 = [np.percentile(years[year], 99) for year in x]
    y_50 = [np.percentile(years[year], 50) for year in x]

    plt.bar(x, y_max, label="Maximum")
    plt.bar(x, y_99, label="99th Percentile")
    plt.bar(x, y_50, label="50th Percentile")
    plt.legend(prop={'size': 10}, loc=9)
    plt.xlabel('Year')
    plt.ylabel('Number of homeruns')
    plt.title('Homeruns per year by player')
    plt.show()

def average_team_rbi(data):
    """Plot each team's RBI per year for the years 1950 through 1959.

    Note: the printed heading says "Average", but the value plotted for a
    team and year is the SUM of the RBI column over that team's rows (the
    axis label and title say "total").

    Args:
        data: Array of batting rows indexed by the ``BattingData`` columns.
    """
    print('Average team RBI over time (1950 - 1959)')
    years = data[:, BattingData.year].astype(np.int32)
    players = data[(years >= 1950) & (years < 1960)]

    # teams[team_name][year] -> summed RBI
    teams = {}
    for player in players:
        by_year = teams.setdefault(player[BattingData.team_name], {})
        year = int(player[BattingData.year])
        by_year[year] = by_year.get(year, 0) + int(player[BattingData.rbi])

    for team in sorted(teams):
        x = sorted(teams[team])
        y = [teams[team][year] for year in x]
        plt.plot(x, y, label=team)

    plt.xlabel('Year')
    plt.ylabel('Total RBI')
    plt.legend(prop={'size': 9}, bbox_to_anchor=(1, 1))
    plt.title('Year vs total RBI per team')
    plt.show()

def extra_credit(data):
    """Placeholder menu action: prints its heading and does nothing else.

    Args:
        data: Batting data array; currently unused.
    """
    print('Extra Credit')

def get_menu_choice(choices):
    """Print a numbered menu and keep prompting until a valid number is typed.

    Args:
        choices: Sequence of menu entry labels; entry ``i`` is shown as ``i:``.

    Returns:
        The chosen index, an int in ``range(len(choices))``.
    """
    for i, choice in enumerate(choices):
        print(str(i) + ':', choice)
    print()
    while True:
        try:
            choice = int(input("> "))
        except ValueError:
            # Not a number at all: handle it like an out-of-range number.
            choice = -1
        if 0 <= choice < len(choices):
            return choice
        print("Invalid choice ( 0 -", len(choices) - 1, ')')

def main():
    """Load the batting data, then run the interactive menu until 'Exit'.

    Raises:
        ValueError: If the loaded data is not an array of strings (for
            example when no rows were read at all).
    """
    data = read_data()
    # Explicit check instead of ``assert`` (asserts vanish under ``python -O``).
    # An array with a str dtype cannot contain None, so no per-item None check
    # is needed.
    if data.dtype.type is not np.str_:
        raise ValueError(
            "Batting data could not be loaded as text; "
            "check the CSV files in the data folder.")

    # Menu label paired with the function that handles it; position in the
    # list is the number the user types. Index 0 exits.
    menu = [
        ('Exit', None),
        ('Histogram of runs scored in the lifetimes of all players (no cutoff, no positions)',
         histogram_no_positions),
        ('Histogram of runs scored in the lifetimes of all players (no cutoff)',
         histogram_no_cutoff),
        ('Histogram of runs scored in the lifetimes of all players (cutoff = 100)',
         histogram_cutoff_100),
        ('Graph team presence over time', team_presence),
        ('Find the batters that had the best and worst stealing bases',
         best_worst_stealing_bases),
        # This entry lists 20 seasons; the label used to say 10.
        ('List the 20 best seasons that batters had by on base percentage',
         best_20_seaons_by_onbase_percentage),
        ('Plot on base percentage for the lifetimes of players with best seasons',
         plot_best_seasons_onbase_percentage),
        ('Plot homeruns over time (percentiles)', homeruns_percentiles),
        ('Plot average team RBI over time (1950 - 1959)', average_team_rbi),
        ('Extra Credit', extra_credit),
    ]
    choices = [label for label, _ in menu]

    while True:
        print()
        choice = get_menu_choice(choices)
        if choice == 0:
            break
        operation = menu[choice][1]
        operation(data)
        input("...")  # wait for Enter before showing the menu again


if __name__ == "__main__":
    main()


#2

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.