import pandas as pd
import matplotlib.pyplot as plt
import re
import time
from itertools import cycle


# Read in player data
# The scraping loop further down reads the "Player" and "Player-additional"
# columns of this table (the latter is used to build each player's URL).
player_data = pd.read_csv("player_data.csv")

# Bare expression: only has an effect as a notebook cell display.
player_data


# Import packages
from bs4 import BeautifulSoup
import requests

# Initialize the base link and the columns of the game-log dataframe
link = "https://www.basketball-reference.com/players/"
columns = ["Name", "G", "Date", "Age", "Tm", "A/H", "Opp", "Res", "GS", "MP", "FG",
           "FGA", "FG%", "3P", "3PA", "3P%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS",
           "GmSc", "+/-"]


# Loop through every player's data page
visited = set()
length = len(player_data["Player"].unique())

# Rows of interest are the <tr> elements whose id looks like "pgl_basic.<digits>".
# Raw string so that "\." is a regex escape rather than an invalid string escape;
# compiled once instead of on every iteration.
game_row_id = re.compile(r"pgl_basic\.[0-9]+")

# Rows are accumulated in a plain list and turned into a dataframe once at the
# end; growing a dataframe one row at a time copies it on every insertion.
records = []

for name, player_id in zip(player_data["Player"], player_data["Player-additional"]):

    # A player name can appear on several rows of player_data; fetch each once.
    if name in visited:
        continue

    # Get the link for the player's individual stats for 2021-2022 season
    player_link = link + player_id[0] + "/" + player_id + "/gamelog/2022"

    # Use BeautifulSoup to get important contents.
    # The timeout keeps one stalled connection from hanging the whole scrape.
    result = requests.get(player_link, timeout=30)
    soup = BeautifulSoup(result.content, "lxml")

    game_rows = soup.find_all("tr", {"id": game_row_id})
    if game_rows:
        # Only mark the player as done when rows were actually found, so a
        # later duplicate entry gets another attempt.
        visited.add(name)

        for day in game_rows:
            cells = ["".join(item.strings) for item in day.find_all("td")]
            if len(cells) != len(columns) - 1:
                # Fail loudly on an unexpected table layout instead of
                # silently producing misaligned columns.
                raise ValueError(
                    "Unexpected number of cells (%d) in a game row for %s"
                    % (len(cells), name)
                )
            records.append([name] + cells)

    # Pause between requests to avoid hammering the site
    time.sleep(5)

df = pd.DataFrame(records, columns = columns)

# Write the data to a csv.
# NOTE: the index is written too, so it comes back as an "Unnamed: 0" column
# when the file is read again.
df.to_csv("individual_data.csv")
df


# Now, we don't need to scrape the data. We can read it in.

df = pd.read_csv("individual_data.csv")

df


# First, let's clean the data

# Columns that are converted to integers (listed once, used on both sides
# of the assignment).
int_columns = [
    "G", "GS", "FG", "FGA", "3P", "3PA", "FT", "FTA", "ORB", "DRB",
    "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS",
]
df[int_columns] = df[int_columns].astype(int)

# Parse the game dates into pandas datetimes
df["Date"] = pd.to_datetime(df["Date"])

def replace(char):
    """Return "A" when *char* is the "@" marker, otherwise "H".

    Any value other than the exact string "@" (including missing values)
    maps to "H".
    """
    return "A" if char == "@" else "H"
    
# Recode the "A/H" column element by element: "@" becomes "A", everything
# else (including missing values) becomes "H".
df["A/H"] = df["A/H"].map(replace)

df


# Now, let's create a new column describing fantasy points
# Weighted sum of the box-score columns:
#   +1 per PTS, TRB and 3P, +2 per AST, +4 per STL and BLK, -2 per TOV,
#   +2 per FG and -1 per FGA, +1 per FT and -1 per FTA.
positive_part = (df["PTS"] + df["TRB"] + df["3P"]
                 + 2 * df["AST"] + 4 * df["STL"] + 4 * df["BLK"]
                 + 2 * df["FG"] + df["FT"])
negative_part = 2 * df["TOV"] + df["FGA"] + df["FTA"]
df["Fantasy_Points"] = positive_part - negative_part

df


# Let's first visualize the data

# Bin edges: width-10 bins from 0 to 60, then one wide bin up to 111.
# Values outside [0, 111] are not counted by the histogram.
fantasy_bins = [*range(0, 61, 10), 111]

fig, ax = plt.subplots(figsize = (20, 20))

ax.set_title("Histogram of fantasy points across the season", fontsize = 35)
ax.set_xlabel("Fantasy Points", fontsize = 25)
ax.set_ylabel("Counts", fontsize = 25)

ax.hist(df["Fantasy_Points"], bins = fantasy_bins)

# Increase tick label size
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

(array([   0., 1000., 2000., 3000., 4000., 5000., 6000., 7000.]),
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])


# Initialize the figure
fig, ax = plt.subplots(figsize = (20, 20))

# Keep only the rows where GS == 1
starter_dataset = df[df["GS"] == 1]

# Create title and labels for plot
ax.set_title("Average Fantasy Points vs Standard Deviation against each team", fontsize = 35)
ax.set_xlabel("Average", fontsize = 25)
ax.set_ylabel("Standard Deviation", fontsize = 25)

# Mean and standard deviation of fantasy points, grouped by opposing team
points_by_opponent = starter_dataset.groupby("Opp")["Fantasy_Points"]
average_points = points_by_opponent.mean().rename("Mean").to_frame()
std_points = points_by_opponent.std().rename("Std").to_frame()

avg_dataset = average_points.join(std_points)

# One labelled marker per opposing team at (Mean, Std)
for team, stats in avg_dataset.iterrows():
    coords = (stats["Mean"], stats["Std"])
    ax.annotate(team, coords, fontsize = 15)
    ax.plot(coords[0], coords[1], marker = 'o')

# Increase tick label size
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

(array([14. , 14.5, 15. , 15.5, 16. , 16.5, 17. , 17.5]),
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])


fig, ax = plt.subplots(figsize = (20, 20))

# Keep only the rows where GS == 1
starter_dataset = df[df["GS"] == 1]

ax.set_title("Average vs Standard Deviation for each team", fontsize = 35)
ax.set_xlabel("Average", fontsize = 25)
ax.set_ylabel("Standard Deviation", fontsize = 25)

# Mean and standard deviation of fantasy points, grouped by the player's team
points_by_team = starter_dataset.groupby("Tm")["Fantasy_Points"]
average_points = points_by_team.mean().rename("Mean").to_frame()
std_points = points_by_team.std().rename("Std").to_frame()

avg_dataset = average_points.join(std_points)

# One labelled marker per team at (Mean, Std)
for team, stats in avg_dataset.iterrows():
    coords = (stats["Mean"], stats["Std"])
    ax.annotate(team, coords, fontsize = 15)
    ax.plot(coords[0], coords[1], marker = 'o')

# Increase tick label size
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

(array([13., 14., 15., 16., 17., 18., 19., 20., 21.]),
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])


# Rows where the team is "LAL" and GS == 1.
# An explicit copy is taken because columns of lakers_data are assigned to
# later on; assigning into a filtered slice of df triggers pandas'
# chained-assignment warning and leaves it unclear which frame is modified.
is_lakers_start = (df["Tm"] == "LAL") & (df["GS"] == 1)
lakers_data = df.loc[is_lakers_start].copy()

lakers_data


'''
In our data, we have rows of data which represent certain games for each player. In order to measure player contribution for a team,
we want to see whether the total contributions add up to eliminate within-player bias, or performance bias between games.

In addition, we will find the standard deviations for each individual starter. It shows consistency throughout the season.
'''

fig, ax = plt.subplots(figsize = (20, 20))

# Keep only the rows where GS == 1 (not used by the plot below)
starter_dataset = df[df["GS"] == 1]

ax.set_title("Average vs Standard Deviation for each Laker", fontsize = 35)
ax.set_xlabel("Average", fontsize = 25)
ax.set_ylabel("Standard Deviation", fontsize = 25)

# Mean and standard deviation of fantasy points for each player in lakers_data
points_by_player = lakers_data.groupby("Name")["Fantasy_Points"]
season_averages = points_by_player.mean().rename("Mean").to_frame()
season_std = points_by_player.std().rename("Std").to_frame()

avg_dataset = season_averages.join(season_std)

# One labelled marker per player at (Mean, Std)
for player_name, stats in avg_dataset.iterrows():
    coords = (stats["Mean"], stats["Std"])
    ax.annotate(player_name, coords, fontsize = 15)
    ax.plot(coords[0], coords[1], marker = 'o')

# Increase tick label size
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

(array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20.]),
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])


# Standard deviation of the per-player mean fantasy points in lakers_data.
# Computed directly from lakers_data rather than from avg_dataset, because
# avg_dataset is reassigned by every plotting cell and only holds the Lakers
# figures if the Lakers plot happened to run last.
lakers_deviation = lakers_data.groupby("Name")["Fantasy_Points"].mean().std()
lakers_deviation

12.890956385772803


# Keep only the rows where GS == 1
starter_data = df[df["GS"] == 1]

# Mean fantasy points per player, kept as a one-column dataframe
season_mean = starter_data.groupby("Name")[["Fantasy_Points"]].mean()

# Standard deviation of those per-player means
population_deviation = season_mean["Fantasy_Points"].std()

population_deviation

9.646586529179343


# Number of distinct names in lakers_data (missing values, if any, count too)
sample_size = lakers_data["Name"].nunique(dropna = False)
sample_size

16


# Test statistic (n - 1) * s^2 / sigma^2, where n is the number of Lakers
# players, s is lakers_deviation and sigma is population_deviation.
# Uses sample_size instead of a hard-coded 16 so the statistic stays correct
# if the underlying data changes.
chi_value = (sample_size - 1) * (lakers_deviation ** 2) / (population_deviation ** 2)

chi_value

26.786391292061595


player_list = lakers_data["Name"].unique()

# One colour per player; cycle() wraps around if there are more players than colours
colors = cycle([
    "purple", "darkgreen", "blue", "pink", "brown", "red", "teal", "orange", "navy",
    "darkturquoise", "black", "maroon", "tan", "indigo", "goldenrod", "olive",
])
fig, ax = plt.subplots(figsize = (20, 20))

plt.title("Fantasy Points of Lakers starters over time", fontsize = 35)
plt.ylabel("Fantasy Points", fontsize = 25)
plt.xlabel("Time (Date)", fontsize = 25)

for player in player_list:

    # Named player_games (not player_data) so the player_data table loaded at
    # the top of the file is not overwritten by this loop.
    player_games = lakers_data.loc[lakers_data["Name"] == player]
    plt.plot(player_games["Date"], player_games["Fantasy_Points"], marker = 'o', label = player, color = next(colors))

plt.legend()

# Increase tick label size
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

(array([-20.,   0.,  20.,  40.,  60.,  80., 100.]),
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])


import numpy as np
# Feature frame (df1) and target series (df2) for the regression models.
# The date is converted to an int64 on a copy of the selected columns, so
# lakers_data itself keeps its datetime "Date" column and this cell can be
# re-run without side effects on earlier plots.
feature_columns = ['Name', 'Date', 'PTS', 'TOV', 'BLK', 'STL', 'AST', 'TRB', '3P', 'FTA', 'FT', 'FGA', 'FG']
df1 = lakers_data[feature_columns].copy()
df1['Date'] = pd.to_datetime(df1['Date']).astype(np.int64)

# NOTE(review): Fantasy_Points is computed from exactly these stat columns,
# so the target is a deterministic function of the features - confirm that
# this is the intended prediction task.
df2 = lakers_data['Fantasy_Points']
df2.head()

738    22
739    20
740    18
921     6
924    21
Name: Fantasy_Points, dtype: int64


import sklearn
from sklearn import linear_model
from sklearn. model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Features are every df1 column except the player name; target is df2
X = df1.drop('Name', axis=1)
y = df2
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit on the training rows only. Fitting on all of X would let the model see
# the test rows and make the reported error optimistic.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
predictions1 = reg.predict(X_test)

# Mean absolute error of the linear model on the held-out rows
LinearM = mean_absolute_error(y_test, predictions1)


X = df1.drop('Name', axis=1)
y = df2
# Deliberately no second train_test_split here: X_train / X_test / y_train /
# y_test from the linear-regression step are reused, so every model is scored
# on the same held-out rows and predictions1 stays aligned with y_test when
# the residuals are plotted.


from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
# Hyper-parameter candidates tried by the grid search
param_grid = {
    'max_depth': [3, 5, 10, 25, 50, 100, None],
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
}

# 5-fold cross-validated search over the grid
grid = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
grid.fit(X_train, y_train)
paramsUpdated = grid.best_params_

# Train a fresh tree with the winning 'criterion' and 'max_depth'
dtc = DecisionTreeRegressor(**paramsUpdated)
dtc.fit(X_train, y_train)
predictions2 = dtc.predict(X_test)

# Mean absolute error of the decision tree on the held-out rows
DTR = mean_absolute_error(y_test, predictions2)


from sklearn.ensemble import RandomForestRegressor
# Hyper-parameter candidates tried by the grid search
param_grid = {
    'max_depth': [3, 5, 10, 25, 50, 100, None],
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
}

# 5-fold cross-validated search over the grid
grid = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    cv=5,
)
grid.fit(X_train, y_train)

best_params = grid.best_params_
optimal_depth, optimal_criterion = best_params['max_depth'], best_params['criterion']
print("Best max depth:", optimal_depth)
print("Best criterion:", optimal_criterion)

Best max depth: 10
Best criterion: squared_error


# Train a forest with the parameters chosen by the grid search
# (fit() returns the estimator itself, so the call can be chained)
rfc = RandomForestRegressor(
    criterion = optimal_criterion,
    max_depth = optimal_depth,
).fit(X_train, y_train)
predictions3 = rfc.predict(X_test)

# Mean absolute error of the random forest on the held-out rows
RFR = mean_absolute_error(y_test, predictions3)


# Mean absolute error per model, keyed by the label shown on the x axis
data = {'LinearM': LinearM, 'DTR': DTR, 'RFR': RFR}
courses, values = list(data), list(data.values())

fig = plt.figure(figsize = (10, 5))

# One bar per model
plt.bar(courses, values, color = 'maroon', width = 0.4)

plt.title("Regression model vs mean absolute error value")
plt.xlabel("Types of regression models")
plt.ylabel("Mean absolute error value")
plt.show()


plt.figure(figsize=(14,12))
plt.title('Comparing Y Values of Different Models', fontdict={'fontweight':'bold', 'fontsize': 18})
plt.xlabel('Date ')
plt.ylabel('Residuals')

# Draw the points in chronological order; the test rows come out of the
# split shuffled, and an unsorted x axis makes each line zigzag across the plot.
order = np.argsort(X_test['Date'].to_numpy(), kind='stable')
# The 'Date' feature holds int64 timestamps; convert back for a readable axis.
dates = pd.to_datetime(X_test['Date'].to_numpy()[order])
actual = y_test.to_numpy()[order]

# Predict from the current X_test for every model so that each residual is
# computed against the matching row of y_test.
fitted_models = [
    ("Linear Reg", reg),
    ("Decison Tree Reg", dtc),
    ("Random Forest Reg", rfc),
]
for label, model in fitted_models:
    residuals = model.predict(X_test)[order] - actual
    plt.plot(dates, residuals, label = label, linestyle="-")

plt.legend()
plt.show()

	Rk	Player	Pos	Age	Tm	G	GS	MP	FG	FGA	...	ORB	DRB	TRB	AST	STL	BLK	TOV	PF	PTS	Player-additional
0	1	Precious Achiuwa	C	22	TOR	73	28	23.6	3.6	8.3	...	2.0	4.5	6.5	1.1	0.5	0.6	1.2	2.1	9.1	achiupr01
1	2	Steven Adams	C	28	MEM	76	75	26.3	2.8	5.1	...	4.6	5.4	10.0	3.4	0.9	0.8	1.5	2.0	6.9	adamsst01
2	3	Bam Adebayo	C	24	MIA	56	56	32.6	7.3	13.0	...	2.4	7.6	10.1	3.4	1.4	0.8	2.6	3.1	19.1	adebaba01
3	4	Santi Aldama	PF	21	MEM	32	0	11.3	1.7	4.1	...	1.0	1.7	2.7	0.7	0.2	0.3	0.5	1.1	4.1	aldamsa01
4	5	LaMarcus Aldridge	C	36	BRK	47	12	22.3	5.4	9.7	...	1.6	3.9	5.5	0.9	0.3	1.0	0.9	1.7	12.9	aldrila01
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
807	601	Thaddeus Young	PF	33	TOR	26	0	18.3	2.6	5.5	...	1.5	2.9	4.4	1.7	1.2	0.4	0.8	1.7	6.3	youngth01
808	602	Trae Young	PG	23	ATL	76	76	34.9	9.4	20.3	...	0.7	3.1	3.7	9.7	0.9	0.1	4.0	1.7	28.4	youngtr01
809	603	Omer Yurtseven	C	23	MIA	56	12	12.6	2.3	4.4	...	1.5	3.7	5.3	0.9	0.3	0.4	0.7	1.5	5.3	yurtsom01
810	604	Cody Zeller	C	29	POR	27	0	13.1	1.9	3.3	...	1.9	2.8	4.6	0.8	0.3	0.2	0.7	2.1	5.2	zelleco01
811	605	Ivica Zubac	C	24	LAC	76	76	24.4	4.1	6.5	...	2.9	5.6	8.5	1.6	0.5	1.0	1.5	2.7	10.3	zubaciv01

	Unnamed: 0	Name	G	Date	Age	Tm	A/H	Opp	Res	GS	...	DRB	TRB	AST	STL	BLK	TOV	PF	PTS	GmSc	+/-
0	0	Precious Achiuwa	1	2021-10-20	22-031	TOR	NaN	WAS	L (-15)	1	...	4	7	2	1	0	1	4	6	3.9	-6.0
1	1	Precious Achiuwa	2	2021-10-22	22-033	TOR	@	BOS	W (+32)	1	...	12	15	0	1	0	0	0	15	15.0	16.0
2	2	Precious Achiuwa	3	2021-10-23	22-034	TOR	NaN	DAL	L (-8)	1	...	9	12	3	0	0	1	3	10	9.6	3.0
3	3	Precious Achiuwa	4	2021-10-25	22-036	TOR	NaN	CHI	L (-3)	1	...	9	11	2	0	0	2	2	11	6.1	-7.0
4	4	Precious Achiuwa	5	2021-10-27	22-038	TOR	NaN	IND	W (+18)	1	...	4	6	0	1	1	0	2	10	8.0	18.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
26007	26007	Ivica Zubac	72	2022-04-01	25-014	LAC	@	MIL	W (+34)	1	...	1	3	4	0	1	0	1	11	11.7	22.0
26008	26008	Ivica Zubac	73	2022-04-03	25-016	LAC	NaN	NOP	W (+19)	1	...	7	14	1	0	1	1	3	16	18.3	3.0
26009	26009	Ivica Zubac	74	2022-04-06	25-019	LAC	NaN	PHO	W (+4)	1	...	10	11	1	0	2	2	4	13	12.6	14.0
26010	26010	Ivica Zubac	75	2022-04-09	25-022	LAC	NaN	SAC	W (+19)	1	...	8	12	2	0	0	0	4	15	14.7	9.0
26011	26011	Ivica Zubac	76	2022-04-10	25-023	LAC	NaN	OKC	W (+50)	1	...	7	11	2	0	2	1	2	12	13.3	18.0

	Unnamed: 0	Name	G	Date	Age	Tm	A/H	Opp	Res	GS	...	DRB	TRB	AST	STL	BLK	TOV	PF	PTS	GmSc	+/-
0	0	Precious Achiuwa	1	2021-10-20	22-031	TOR	H	WAS	L (-15)	1	...	4	7	2	1	0	1	4	6	3.9	-6.0
1	1	Precious Achiuwa	2	2021-10-22	22-033	TOR	A	BOS	W (+32)	1	...	12	15	0	1	0	0	0	15	15.0	16.0
2	2	Precious Achiuwa	3	2021-10-23	22-034	TOR	H	DAL	L (-8)	1	...	9	12	3	0	0	1	3	10	9.6	3.0
3	3	Precious Achiuwa	4	2021-10-25	22-036	TOR	H	CHI	L (-3)	1	...	9	11	2	0	0	2	2	11	6.1	-7.0
4	4	Precious Achiuwa	5	2021-10-27	22-038	TOR	H	IND	W (+18)	1	...	4	6	0	1	1	0	2	10	8.0	18.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
26007	26007	Ivica Zubac	72	2022-04-01	25-014	LAC	A	MIL	W (+34)	1	...	1	3	4	0	1	0	1	11	11.7	22.0
26008	26008	Ivica Zubac	73	2022-04-03	25-016	LAC	H	NOP	W (+19)	1	...	7	14	1	0	1	1	3	16	18.3	3.0
26009	26009	Ivica Zubac	74	2022-04-06	25-019	LAC	H	PHO	W (+4)	1	...	10	11	1	0	2	2	4	13	12.6	14.0
26010	26010	Ivica Zubac	75	2022-04-09	25-022	LAC	H	SAC	W (+19)	1	...	8	12	2	0	0	0	4	15	14.7	9.0
26011	26011	Ivica Zubac	76	2022-04-10	25-023	LAC	H	OKC	W (+50)	1	...	7	11	2	0	2	1	2	12	13.3	18.0

	Unnamed: 0	Name	G	Date	Age	Tm	A/H	Opp	Res	GS	...	TRB	AST	STL	BLK	TOV	PF	PTS	GmSc	+/-	Fantasy_Points
0	0	Precious Achiuwa	1	2021-10-20	22-031	TOR	H	WAS	L (-15)	1	...	7	2	1	0	1	4	6	3.9	-6.0	15
1	1	Precious Achiuwa	2	2021-10-22	22-033	TOR	A	BOS	W (+32)	1	...	15	0	1	0	0	0	15	15.0	16.0	34
2	2	Precious Achiuwa	3	2021-10-23	22-034	TOR	H	DAL	L (-8)	1	...	12	3	0	0	1	3	10	9.6	3.0	24
3	3	Precious Achiuwa	4	2021-10-25	22-036	TOR	H	CHI	L (-3)	1	...	11	2	0	0	2	2	11	6.1	-7.0	17
4	4	Precious Achiuwa	5	2021-10-27	22-038	TOR	H	IND	W (+18)	1	...	6	0	1	1	0	2	10	8.0	18.0	21
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
26007	26007	Ivica Zubac	72	2022-04-01	25-014	LAC	A	MIL	W (+34)	1	...	3	4	0	1	0	1	11	11.7	22.0	26
26008	26008	Ivica Zubac	73	2022-04-03	25-016	LAC	H	NOP	W (+19)	1	...	14	1	0	1	1	3	16	18.3	3.0	38
26009	26009	Ivica Zubac	74	2022-04-06	25-019	LAC	H	PHO	W (+4)	1	...	11	1	0	2	2	4	13	12.6	14.0	33
26010	26010	Ivica Zubac	75	2022-04-09	25-022	LAC	H	SAC	W (+19)	1	...	12	2	0	0	0	4	15	14.7	9.0	33
26011	26011	Ivica Zubac	76	2022-04-10	25-023	LAC	H	OKC	W (+50)	1	...	11	2	0	2	1	2	12	13.3	18.0	35

	Unnamed: 0	Name	G	Date	Age	Tm	A/H	Opp	Res	GS	...	TRB	AST	STL	BLK	TOV	PF	PTS	GmSc	+/-	Fantasy_Points
738	738	Carmelo Anthony	14	2021-11-14	37-169	LAL	H	SAS	W (+8)	1	...	5	0	0	0	1	4	15	10.6	-6.0	22
739	739	Carmelo Anthony	15	2021-11-15	37-170	LAL	H	CHI	L (-18)	1	...	4	1	2	0	0	2	9	6.7	-21.0	20
740	740	Carmelo Anthony	16	2021-11-17	37-172	LAL	A	MIL	L (-7)	1	...	6	1	1	0	2	2	10	5.3	1.0	18
921	921	Trevor Ariza	5	2022-01-04	36-188	LAL	H	SAC	W (+8)	1	...	3	0	0	1	0	0	0	0.9	-8.0	6
924	924	Trevor Ariza	8	2022-01-12	36-196	LAL	A	SAC	L (-9)	1	...	5	3	3	0	1	2	2	5.2	-14.0	21
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
24725	24725	Russell Westbrook	74	2022-03-29	33-137	LAL	A	DAL	L (-18)	1	...	8	6	0	0	2	3	25	20.1	-25.0	43
24726	24726	Russell Westbrook	75	2022-03-31	33-139	LAL	A	UTA	L (-13)	1	...	6	7	0	0	3	2	24	16.7	-17.0	34
24727	24727	Russell Westbrook	76	2022-04-01	33-140	LAL	H	NOP	L (-3)	1	...	4	5	0	0	1	6	12	5.6	-12.0	21
24728	24728	Russell Westbrook	77	2022-04-03	33-142	LAL	H	DEN	L (-11)	1	...	10	7	2	1	2	5	27	26.3	1.0	61
24729	24729	Russell Westbrook	78	2022-04-05	33-144	LAL	A	PHO	L (-11)	1	...	5	3	0	0	6	2	28	14.4	-9.0	27

NBA Fantasy Basketball Analysis Tutorial by Andrew Zheng and Zayn.A.Hijazi

1. Introduction

2. Data Manipulation

2.1 Scraping Player Data

2.2 Cleaning the Data

3. Visualizing the Data

4. Hypothesis Testing

Machine Learning and the Prediction of scores:

Linear Regression Model:

Decision Tree Regression Model:

Random Forest Regression Model:

Comparison of Models Using MAE:

Trying to visualize our models

Conclusion