#INITIAL SETTINGS
#NB put this file in the same folder as Teams_overall2020.csv
## ------------------------------------------------------------------------
#set working directory (the function below sets the wd in the folder where this script is)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
rm(list=ls())



# install and load the relevant packages
# packages used
listofpackages <- c("lrmest", "dplyr","assertthat","bindrcpp","glue","pkgconfig","utf8","cli","ellipse","reshape2","ggplot2","dygraphs","aod")

for (j in listofpackages){
  if(sum(installed.packages()[, 1] == j) == 0) {
    install.packages(j)
  }
  library(j, character.only = T)
}


NBAdata=read.csv("C:/Users/favero/Dropbox/exam/SPORTMAN/R/data_L_2020/Teams_overall2020.csv", header = T, stringsAsFactors = F, sep = ";")
typeof(NBAdata)#to check the type of data
## ------------------------------------------------------------------------
## ------------------------------------------------------------------------
## ------------------------------------------------------------------------
## ------------------------------------------------------------------------
#1. Using regression analysis assess the relative performance of the NBA Efficiency measure and of the model
#proposed in the class to explain WINS over the sample of seasons 1994-2005 (omitting season 1999)

#DATA TRANSFORMATION 
## ------------------------------------------------------------------------

NBAdata$FGMISS=NBAdata$FGA-NBAdata$FG
NBAdata$FTMISS=NBAdata$FTA-NBAdata$FT
NBAdata$OFGMISS=NBAdata$OFGA-NBAdata$OFG
NBAdata$OFTMISS=NBAdata$OFTA-NBAdata$OFT
NBAdata$MISS=NBAdata$FGMISS+NBAdata$FTMISS
NBAdata$OMISS=NBAdata$OFGMISS+NBAdata$OFTMISS
NBAdata$W.=NBAdata$W/(NBAdata$W+NBAdata$L)
## ------------------------------------------------------------------------

#NBA EFFICIENCY MEASURES
## ------------------------------------------------------------------------

NBAdata$eff_NBA=NBAdata$PTS+NBAdata$TRB+NBAdata$STL+NBAdata$BLK-NBAdata$MISS-NBAdata$TOV
NBAdata$eff_O_NBA=NBAdata$OPTS+NBAdata$OTRB+NBAdata$OSTL+NBAdata$OBLK-NBAdata$OMISS-NBAdata$OTOV
NBAdata$rel_effNBA=NBAdata$eff_NBA-NBAdata$eff_O_NBA
## ------------------------------------------------------------------------

#MODELLING PROCESS
#EMPLOYED POSSESSIONS
## ------------------------------------------------------------------------

NBAdata$empl_poss=NBAdata$FGA + 0.45*NBAdata$FTA + NBAdata$TOV - NBAdata$ORB
NBAdata$ptsxgame=NBAdata$PTS/NBAdata$G
NBAdata$ptsxposs=NBAdata$PTS/NBAdata$empl_poss
## ------------------------------------------------------------------------

#ACQUIRED POSSESSIONS
## ------------------------------------------------------------------------
#Field goal attempt differenced
NBAdata$FGAD=NBAdata$FGA-NBAdata$OFG-NBAdata$OTOV-NBAdata$TRB+NBAdata$TOV

#Regression Analysis to check variable construction
reg9405 = subset(NBAdata,(Season<2006 & Season>1993 & Season!=1999),select=c(FGAD,OFT,FTA))
reg_1 = lm(reg9405$FGAD ~ reg9405$OFT + reg9405$FTA )
summary(reg_1)
#Hypothesis Testing
wald.test(vcov(reg_1), coef(reg_1), H0 = c(0.45, -0.45), df = 313,
          L = matrix(c(0, 0, 1, 0,
                       0, 1), ncol = 3), verbose = T)

#USING Regressions results to create proxy
#NBAdata$TEAM_R1=reg_1$coefficient[1]+reg_1$residuals
NBAdata$TEAM_R=NBAdata$FGAD-0.45*NBAdata$OFT+0.45*NBAdata$FTA
NBAdata$TEAM_R2=NBAdata$FGA-NBAdata$OFG-NBAdata$OTOV-NBAdata$TRB+NBAdata$TOV-0.45*NBAdata$OFT+0.45*NBAdata$FTA
TeamReb_test=NBAdata$TEAM_R2-NBAdata$TEAM_R
#TeamReb_test


# ALTERNATIVE SOLUTION 
# reg9405$TEAM_R=reg_1$coefficient[1]+reg_1$residuals
# reg9405$TEAM_R1=reg9405$FGAD-0.45*reg9405$OFT+0.45*reg9405$FTA
# plot(x = reg9405$TEAM_R, y = reg9405$TEAM_R1)

plot(x = NBAdata$Season, y = NBAdata$TEAM_R, main = "Estimated Team Rebounds", ylab = "TR",ylim = c(200,4000), xlab = "Seasons",col = "red")
points(x = NBAdata$Season, y = NBAdata$DRB,col = "blue")

NBAdata$acq_poss=NBAdata$OTOV + NBAdata$DRB+NBAdata$TEAM_R+ NBAdata$OFG + 0.45*NBAdata$OFT
NBAdata$ptsaxgame=NBAdata$OPTS/NBAdata$G
NBAdata$ptsall_poss=NBAdata$OPTS/NBAdata$acq_poss

## ------------------------------------------------------------------------
#CONSTRUCTED EFFICIENCY (POINTSXPOSSESSION - OPPOINTSXACQUIREDPOSSESSION)
## ------------------------------------------------------------------------
NBAdata$eff=(NBAdata$ptsxposs-NBAdata$ptsall_poss)

## ------------------------------------------------------------------------
# MODEL ESTIMATION measuring the impact of defensive and offensive efficiency
## ------------------------------------------------------------------------

s9405 = subset(NBAdata,(Season>1993 & Season<2006 & Season!=1999))
plot(x = s9405$Season, y = s9405$W, main = "WINS", ylab = "TR",ylim = c(0,85), xlab = "Seasons",col = "red")

reg_NBA_eff = lm(s9405$W ~  s9405$eff_NBA + s9405$eff_O_NBA) #Regression using NBA efficiency mesures
summary(reg_NBA_eff)
summary(reg_NBA_eff)$r.squared
adjRsq_NBA_eff=0.9118
# summary(reg_NBA_eff)$adj.r.squared

reg_NBA_releff = lm(s9405$W ~  s9405$rel_effNBA ) #Regression using NBA relative efficiency mesure
summary(reg_NBA_releff)
summary(reg_NBA_releff)$r.squared
adjRsq_NBA_releff=0.9108


reg_MODEL=lm(s9405$W ~  s9405$eff)
summary(reg_MODEL)
summary(reg_MODEL)$r.squared
adjRsq_MODEL= 0.9479

reg_both=lm(s9405$W ~  s9405$eff + s9405$eff_NBA + s9405$eff_O_NBA )
summary(reg_both)
summary(reg_both)$r.squared
adjRsq_both= 0.9479
# ALTERNATIVE SOLUTION 
#fit1=reg_NBA_releff$fitted.values
#fit2=reg_MODEL$fitted.values
#reg_both=lm(s9405$W ~ fit1 + fit2  )
#summary(reg_both)
## ------------------------------------------------------------------------
#ANSWER
#The performance model-constructed efficiency seems to be higher 
#Considering that the parameter of the first 3 regressions are all significant at 0.001 level
#The R square is higher when the regressor is the modelled efficiency.
#Hence, it is able to better explain the variability of the model.
#Moreover, adding in the third regression the NBA efficiency measures 
#the adjR^2 (purified by degrees of freedom) remains the same (0.9479). 
#This means that, adding them, there are not more informations.
#In fact, in the fourth (reg_both) regression, the coefficients of NBA efficiency measures are not significant at any level.
#This is the model that allows us to assess the validity of the model against the NBA efficiency.
#In fact, there is evidence that the new efficency model dominates the NBA efficiency model in term of explaining WINS.


#2. Consider the baseline MODEL for WINS estimated in the lecture over the sample 1994-2005.

#a. The model estimated in the lecture drops Season 1998-99. Why? 

#ANSWER
#Season 1999 is excluded because of strikes that occured during that year.
#Therefore, number of games is smaller than a regular season. All variables are affected by this situation.

#What happens to the results obtained in class if you extend the sample by including Season 1998-1999:

#IMPACT CALCULATION
## ------------------------------------------------------------------------
regeff_1999 = subset(NBAdata,(Season>1993 & Season<2006))
s9405 = subset(NBAdata,(Season>1993 & Season<2006 & Season!=1999))
reg_NBA_1999 = lm(regeff_1999$W ~  regeff_1999$eff_NBA + regeff_1999$eff_O_NBA )
summary(reg_NBA_1999)

reg_MODEL_1999=lm(regeff_1999$W ~  regeff_1999$eff)
summary(reg_MODEL_1999)
summary(reg_MODEL)


R2_ratio_NBA = summary(reg_NBA_1999)$r.squared/summary(reg_NBA_eff)$r.squared
R2_ratio_NBA #greater than 1. Linear regression is not sensitive to the presence of outliers


R2_ratio_model = summary(reg_MODEL_1999)$r.squared/summary(reg_MODEL)$r.squared
R2_ratio_model #lower than 1. Linear regression is sensitive to the presence of outliers
## ------------------------------------------------------------------------
#ANSWER
#R2 decrease for all regressions due to Season 1999's strikes (outlier).
#The drop in R2 is bigger for the regression where the regressor is the modelled efficiency.
#The coefficents of both regressions remain significant at 0.001 level.


#b. Add to the baseline model a Season effect and a Team effect and test their significance.

#Dataset s9405
#The two dummies are constructed as following:
## ------------------------------------------------------------------------

s9405$SeasonEffect<- ifelse(s9405$Season=="2002",1,0)
  
s9405$TeamEffect<- ifelse(s9405$Team=="Chicago Bulls",1,0)
## ------------------------------------------------------------------------
# Model estimation including the two dummies 

reg_M_w_dummies=lm(s9405$W ~  s9405$eff + s9405$SeasonEffect + s9405$TeamEffect)

summary(reg_M_w_dummies)
summary(reg_M_w_dummies)$r.squared

# ALTERNATIVE SPECIFICATION 
#s9405$Season_dum=as.character(s9405$Season)
#reg_M_wall_dummies=lm(s9405$W ~  s9405$eff + s9405$Team +s9405$Season_dum)
#summary(reg_M_wall_dummies)


#ANSWER
#The coefficients of the two dummies are not significant at any level and the Rsqr does not increase.
#Hence, Season Effect and Team Effect dummies are not able to add relevant informations to the model 
#and to explain additional variance.

#3. Compare the Team Rebounds constructed by the formula used in the baseline programme with 
#the proxy obtained by considering the sum of residuals and estimated constant in the regression of 
#Total Possession Differenced on Free Throws and Opponent Free Throws

#Team Rebounds w/sum of residuals and constant in the regression of Total Possession Differenced 
#on Free Throws and Opponent Free Throws
## ------------------------------------------------------------------------
NBAdata$empl_poss_diff=NBAdata$FGA+NBAdata$TOV-NBAdata$ORB
NBAdata$acq_poss_diff=NBAdata$OTOV+NBAdata$DRB+NBAdata$OFG
NBAdata$tot_poss_diff=NBAdata$empl_poss_diff-NBAdata$acq_poss_diff
NBAdata$TEAM_R=NBAdata$FGAD-0.45*NBAdata$OFT+0.45*NBAdata$FTA

regTPD = subset(NBAdata,(Season<2006 & Season>1993 & Season!=1999),select=c(tot_poss_diff,OFT,FTA,TEAM_R))
reg_2 = lm(regTPD$tot_poss_diff ~ regTPD$OFT + regTPD$FTA )
summary(reg_2)

TEAM_R_TPD=coef(reg_2)[1]+reg_2$residuals
## ------------------------------------------------------------------------

compare=abs(TEAM_R_TPD-regTPD$TEAM_R)
print(compare) 

#ANSWER
#The difference between  is on average 51 in absolute value

#3. Using the simulation model
#(a) derive the effect of a made free throws on WINS

# DETERMINISTIC  SIMULATION
## ------------------------------------------------------------------------
#baseline no shock, specify values for each stats 
FTA=mean(s9405$FTA)
TOV=mean(s9405$TOV)
ORB=mean(s9405$ORB)
X3P=mean(s9405$X3P)
X2P=mean(s9405$X2P)
FGMISS=mean(s9405$FGMISS)
FGA=X3P+X2P+FGMISS
FT=mean(s9405$FT)
OTOV=mean(s9405$OTOV)
DRB=mean(s9405$DRB)
TEAM_R=mean(s9405$TEAM_R)
OFG=mean(s9405$OFG)
OFT=mean(s9405$OFT)
OX3P=mean(s9405$O3P)
OX2P=mean(s9405$O2P)
OFT=mean(s9405$OFT)

# identities
PTS=3*X3P+2*X2P+1*FT
empl_poss=FGA + 0.44*FTA + TOV - ORB
pts_poss=PTS/empl_poss
OPTS=3*OX3P+2*OX2P+1*OFT
acq_poss=OTOV + DRB+TEAM_R+ OFG + 0.45*OFT
ptsall_poss=OPTS/acq_poss

#ALTERNATIVE SCENARIO CHOOSE the SHOCK

FTA_AL=mean(s9405$FTA)
TOV_AL=mean(s9405$TOV)
ORB_AL=mean(s9405$ORB)
X3P_AL=mean(s9405$X3P)
X2P_AL=mean(s9405$X2P)
FGMISS_AL=mean(s9405$FGMISS)
FGA_AL=X3P_AL+X2P_AL+FGMISS_AL
FT_AL=mean(s9405$FT)+1
OTOV_AL=mean(s9405$OTOV)
DRB_AL=mean(s9405$DRB)
TEAM_R_AL=mean(s9405$TEAM_R)
OFG_AL=mean(s9405$OFG)
OFT_AL=mean(s9405$OFT)
OX3P_AL=mean(s9405$O3P)
OX2P_AL=mean(s9405$O2P)
OFT_AL=mean(s9405$OFT)

# identities
PTS_AL=3*X3P_AL+2*X2P_AL+1*FT_AL
empl_poss_AL=FGA_AL + 0.44*FTA_AL + TOV_AL - ORB_AL
pts_poss_AL=PTS_AL/empl_poss_AL
OPTS_AL=3*OX3P_AL+2*OX2P_AL+1*OFT_AL
acq_poss_AL=OTOV_AL + DRB_AL+TEAM_R_AL+ OFG_AL + 0.45*OFT_AL
ptsall_poss_AL=OPTS_AL/acq_poss_AL

# now add the last equation and simulate WINS under the two scenarios 
W_BL <- reg_MODEL$coefficients[1] +reg_MODEL$coefficients[2]*(pts_poss-ptsall_poss)
W_AL <- reg_MODEL$coefficients[1] +reg_MODEL$coefficients[2]*(pts_poss_AL-ptsall_poss_AL)
VAL=W_AL-W_BL
VAL 
## ------------------------------------------------------------------------

#ANSWER
#3% change in WINS estimation by an increase in 1 FT

#(b) compare the distribution of wins form the simulated baseline model with 
#that observed in the data and comment on the difference between them.

# STOCHASTIC  SIMULATION
## ------------------------------------------------------------------------
#baseline no shock
FTA=mean(s9405$FTA)
TOV=mean(s9405$TOV)
ORB=mean(s9405$ORB)
X3P=mean(s9405$X3P)
X2P=mean(s9405$X2P)
FGMISS=mean(s9405$FGMISS)
FGA=X3P+X2P+FGMISS
FT=mean(s9405$FT)
OTOV=mean(s9405$OTOV)
DRB=mean(s9405$DRB)
TEAM_R=mean(s9405$TEAM_R)
OFG=mean(s9405$OFG)
OFT=mean(s9405$OFT)
OX3P=mean(s9405$O3P)
OX2P=mean(s9405$O2P)
OFT=mean(s9405$OFT)

# identities
PTS=3*X3P+2*X2P+1*FT
empl_poss=FGA + 0.44*FTA + TOV - ORB
pts_poss=PTS/empl_poss
OPTS=3*OX3P+2*OX2P+1*OFT
acq_poss=OTOV + DRB+TEAM_R+ OFG + 0.45*OFT
ptsall_poss=OPTS/acq_poss

#WINS from data
hist(NBAdata$W, breaks = seq(min(NBAdata$W), max(NBAdata$W), l = 20+1),prob=TRUE, main = "WINS from data")
curve(dnorm(x,mean=mean(NBAdata$W),sd=sd(NBAdata$W)),col='darkblue',lwd=2,add=TRUE)
up=mean(NBAdata$W)+2*sd(NBAdata$W)
down=mean(NBAdata$W)-2*sd(NBAdata$W)
#on avg data tell us that number of wins in a season is 40. 95% of obs are in the interval (14.7,66)

# preparing to simulate two scenarios  simulation
nrep <- 10^3
tT <- 1 # 1 obs
W_BL <- VAL_STAT <- array(0, c(tT, nrep)) # the containers

for (i in 1:nrep){
  
  ### Monte Carlo
  # standard normal times the standard error of the estimatedof the relevant regression
  x <- rt( n=tT, df=314 )*summary(reg_MODEL)$coefficients[2,2]
  
  # simulating wins under the two scenarios and the effect of a stat
  
  W_BL[, i] <- reg_MODEL$coefficients[1] +(reg_MODEL$coefficients[2]+x)*(pts_poss-ptsall_poss)
 
}

VAL=W_BL
hist(VAL, breaks = seq(min(VAL), max(VAL), l = 20+1),prob=TRUE, main = "histogram of effects")
curve(dnorm(x,mean=mean(VAL),sd=sd(VAL)),col='darkblue',lwd=2,add=TRUE)
up_BL=mean(VAL)+2*sd(VAL)
down_BL=mean(VAL)-2*sd(VAL)
## ------------------------------------------------------------------------

#ANSWER
#Stochastic simulations with baseline model predicts 41 wins on avg and very low variance. 
#It is almost deterministic because range of predicted values is very small