# install and load the relevant packages # packages used listofpackages <- c("lrmest", "dplyr","assertthat","bindrcpp","glue","pkgconfig","utf8","cli","ellipse","reshape2","ggplot2","dygraphs","aod") for (j in listofpackages){ if(sum(installed.packages()[, 1] == j) == 0) { install.packages(j) } library(j, character.only = T) } #setwd(path) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #clear the environment rm(list=ls()) NBAdata=read.csv("C:/Users/favero/Dropbox/exam/SPORTMAN/R/data_L_2020/Teams_overall2020.csv", header = T, stringsAsFactors = F, sep = ";") head(NBAdata) typeof(NBAdata)#to check the type of data #remember that season 2004/2005 is coded as 2005 and Oklahoma City Thunder were Seattle Supersonics until 2007 ## ------------------------------------------------------------------------ #DATA TRANSFORMATION ## ------------------------------------------------------------------------ NBAdata$FGMISS=NBAdata$FGA-NBAdata$FG NBAdata$FTMISS=NBAdata$FTA-NBAdata$FT NBAdata$OFGMISS=NBAdata$OFGA-NBAdata$OFG NBAdata$OFTMISS=NBAdata$OFTA-NBAdata$OFT NBAdata$MISS=NBAdata$FGMISS+NBAdata$FTMISS NBAdata$OMISS=NBAdata$OFGMISS+NBAdata$OFTMISS NBAdata$W.=NBAdata$W/(NBAdata$W+NBAdata$L) #NBA EFFICIENCY MEASURES NBAdata$eff_NBA=NBAdata$PTS+NBAdata$TRB+NBAdata$STL+NBAdata$BLK-NBAdata$MISS-NBAdata$TOV NBAdata$eff_O_NBA=NBAdata$OPTS+NBAdata$OTRB+NBAdata$OSTL+NBAdata$OBLK-NBAdata$OMISS-NBAdata$OTOV NBAdata$rel_effNBA=NBAdata$eff_NBA-NBAdata$eff_O_NBA #CORRELATION ANALYSIS datacor00 = subset(NBAdata,(Season<2006 & Season>1994),select=c(W.,DRB,FGMISS,AST,PTS,TOV,PF,ORB,STL,BLK,FTMISS)) datacor00 = na.omit(datacor00) cor.datacor = cor(datacor00, use="complete.obs") cor.datacor # nice graphics presentation of correlations needs package ellipse ord = order(cor.datacor[1,]) ordered.cor.datacor = cor.datacor[ord, ord] plotcorr(ordered.cor.datacor, col=cm.colors(11)[5*ordered.cor.datacor + 6]) # even nicer correlation heatmap needs packages reshape2 ggplot2 cormat = round(cor(datacor00),2) head(cormat) melted_cormat = melt(cormat) head(melted_cormat) ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + geom_tile() # Get lower triangle of the correlation matrix get_lower_tri=function(cormat){ cormat[upper.tri(cormat)] = NA return(cormat) } # Get upper triangle of the correlation matrix get_upper_tri = function(cormat){ cormat[lower.tri(cormat)] = NA return(cormat) } upper_tri = get_upper_tri(cormat) upper_tri # Melt the correlation matrix melted_cormat = melt(upper_tri, na.rm = TRUE) # Heatmap ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+ geom_tile(color = "white")+ scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") + theme_minimal()+ theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+ coord_fixed() ## ------------------------------------------------------------------------ # REGRESSIONS WITHOUT A MODEL ## ------------------------------------------------------------------------ REG = subset(NBAdata,(Season<2006 & Season>1994),select=c(W,W.,DRB,FGMISS,AST,PTS,TOV,PF,ORB,STL,BLK,FTMISS,ODRB,OFGMISS,OAST,OPTS,OTOV,OPF,OORB,OSTL,OBLK,OFTMISS,eff_NBA,eff_O_NBA,rel_effNBA)) reg_op1 = lm(W~ PTS+ORB+DRB+AST+STL+BLK+FGMISS+FTMISS+TOV,REG ) summary(reg_op1) reg_op2 = lm(W~ PTS+ORB+DRB+AST+STL+BLK+FGMISS+FTMISS+TOV+OPTS+OORB+ODRB+OAST+OSTL+OBLK+OFGMISS+OFTMISS+OTOV,REG ) summary(reg_op2) reg_up1 = lm(W~eff_NBA+eff_O_NBA,REG ) summary(reg_up1) reg_up2 = lm(W~rel_effNBA,REG ) summary(reg_up2) plot(REG$W, pch=20, ylim=c(10, 82), xaxt="n", xlab="Team and Time", ylab="Wins",col = "blue") lines(reg_up1$fitted.values,col = "red",pch=8) lines(reg_up2$fitted.values,col = "green",pch=8) plot(reg_up1$residuals, pch=20, ylim=c(-20, 20), xaxt="n", xlab="Team and Time", ylab="Wins",col = "blue") lines(reg_up2$residuals,col = "red",pch=8) ## ------------------------------------------------------------------------ # REGRESSIONS WITH A MODEL ## ------------------------------------------------------------------------ ## ------------------------------------------------------------------------ #DATA TRANSFORMATION ## ------------------------------------------------------------------------ #EMPLOYED POSSESSIONS NBAdata$empl_poss=NBAdata$FGA + 0.44*NBAdata$FTA + NBAdata$TOV - NBAdata$ORB NBAdata$ptsxgame=NBAdata$PTS/NBAdata$G NBAdata$ptsxposs=NBAdata$PTS/NBAdata$empl_poss #ACQUIRED POSSESSIONS #Field goal attempt differenced NBAdata$FGAD=NBAdata$FGA-NBAdata$OFG-NBAdata$OTOV-NBAdata$TRB+NBAdata$TOV #Regression Analysis to check variable construction reg9405 = subset(NBAdata,(Season<2006 & Season>1993 & Season!=1999),select=c(FGAD,OFT,FTA)) reg_1 = lm(reg9405$FGAD ~ reg9405$OFT + reg9405$FTA ) summary(reg_1) #Hypothesis Testing wald.test(vcov(reg_1), coef(reg_1), H0 = c(0.45, -0.45), df = 313, L = matrix(c(0, 0, 1, 0, 0, 1), ncol = 3), verbose = T) #USING Regressions results to create proxy #NBAdata$TEAM_R=coef(reg_1)[1]+reg_1$residuals NBAdata$TEAM_R=NBAdata$FGAD-0.45*NBAdata$OFT+0.45*NBAdata$FTA plot(x = NBAdata$Season, y = NBAdata$TEAM_R, main = "Estimated Team Rebounds", ylab = "TR",ylim = c(200,4000), xlab = "Seasons",col = "red") points(x = NBAdata$Season, y = NBAdata$DRB,col = "blue") #NBAdata$TEAM_R1=NBAdata$OMISS-NBAdata$DRB-NBAdata$ORB #plot(x = NBAdata$Season, y = NBAdata$TEAM_R, main = "Estimated Team Rebounds", ylab = "TR",ylim = c(200,1500), xlab = "Seasons",col = "red") #points(x = NBAdata$Season, y = NBAdata$TEAM_R1,col = "blue") NBAdata$acq_poss=NBAdata$OTOV + NBAdata$DRB+NBAdata$TEAM_R+ NBAdata$OFG + 0.45*NBAdata$OFT NBAdata$ptsaxgame=NBAdata$OPTS/NBAdata$G NBAdata$ptsall_poss=NBAdata$OPTS/NBAdata$acq_poss NBAdata$eff=(NBAdata$ptsxposs-NBAdata$ptsall_poss) plot(x = NBAdata$Season, y = NBAdata$acq_poss, main = "Estimated Possession", ylab = "TR",ylim = c(6000,10000), xlab = "Seasons",col = "red") points(x = NBAdata$Season, y = NBAdata$empl_poss,col = "blue") DATA_AH=subset(NBAdata,(Team== "Atlanta Hawks"),select=c(Season,acq_poss,empl_poss)) plot(x = DATA_AH$Season, y = DATA_AH$acq_poss, main = "Atlanta Hawks", ylab = "TPOSS",ylim = c(6000,10000), xlab = "Seasons",col = "red") points(x = DATA_AH$Season, y = DATA_AH$empl_poss,col = "blue") ## ------------------------------------------------------------------------ # DESCRIPTIVE ANALYSIS ## ------------------------------------------------------------------------ #What we want to see now is the most efficient team in employing possessions TABLE_6.2=subset(NBAdata,(Season== 2005),select=c(Team,ptsxposs,ptsxgame)) ranking6_2PTSxP = TABLE_6.2[order(TABLE_6.2$ptsxposs, TABLE_6.2$Team, decreasing = TRUE),] #similarly we rank teams wrt points scored per game ranking6_2PTSxG = TABLE_6.2[order(TABLE_6.2$ptsxgame, TABLE_6.2$Team, decreasing = TRUE),] #same ranking for the acquired possessions TABLE_6.3=subset(NBAdata,(Season== 2005),select=c(Team,ptsall_poss,ptsaxgame)) ranking6_3PTSAxPA = TABLE_6.3[order(TABLE_6.3$ptsall_poss, TABLE_6.3$Team, decreasing = FALSE),] ranking6_3PTSAxG = TABLE_6.3[order(TABLE_6.3$ptsaxgame, TABLE_6.3$Team, decreasing = FALSE),] # MODEL ESTIMATION measuring the impact of defensive and offensive efficiency regeff = subset(NBAdata,(Season>1993 & Season<2006 & Season!=1999)) regeff$Season_dum=as.character(regeff$Season) plot(x = regeff$Season, y = regeff$W, main = "WINS", ylab = "TR",ylim = c(0,85), xlab = "Seasons",col = "red") reg_1 = lm(regeff$W ~ regeff$PTS + regeff$OPTS) summary(reg_1) reg_2 = lm(regeff$W ~ regeff$ptsall_poss+regeff$ptsxposs ) summary(reg_2) reg_3 = lm(regeff$W ~ regeff$eff ) summary(reg_3) reg_4=lm(regeff$W ~ regeff$eff+regeff$eff_NBA + regeff$eff_O_NBA ) summary(reg_4) plot(regeff$W, pch=20, ylim=c(15, 82), ylab="Wins",col = "blue") lines(reg_3$fitted.values,col = "red", lwd = 2,type="l") # storing the residuals resid_3 <- reg_3$residuals ## ------------------------------------------------------------------------ # MODEL SIMULATION ## ------------------------------------------------------------------------ ## ------------------------------------------------------------------------ # DETERMINISTIC SIMULATION ## ------------------------------------------------------------------------ #baseline no shock, specify values for each stats FTA=mean(regeff$FTA) TOV=mean(regeff$TOV) ORB=mean(regeff$ORB) X3P=mean(regeff$X3P) X2P=mean(regeff$X2P) FGMISS=mean(regeff$FGMISS) FGA=X3P+X2P+FGMISS FT=mean(regeff$FT) OTOV=mean(regeff$OTOV) DRB=mean(regeff$DRB) TEAM_R=mean(regeff$TEAM_R) OFG=mean(regeff$OFG) OFT=mean(regeff$OFT) OX3P=mean(regeff$O3P) OX2P=mean(regeff$O2P) OFT=mean(regeff$OFT) # identities PTS=3*X3P+2*X2P+1*FT empl_poss=FGA + 0.44*FTA + TOV - ORB pts_poss=PTS/empl_poss OPTS=3*OX3P+2*OX2P+1*OFT acq_poss=OTOV + DRB+TEAM_R+ OFG + 0.45*OFT ptsall_poss=OPTS/acq_poss #ALTERNATIVE SCENARIO CHOOSE the SHOCK FTA_AL=mean(regeff$FTA) TOV_AL=mean(regeff$TOV) ORB_AL=mean(regeff$ORB) X3P_AL=mean(regeff$X3P)+1 X2P_AL=mean(regeff$X2P) FGMISS_AL=mean(regeff$FGMISS) FGA_AL=X3P_AL+X2P_AL+FGMISS_AL FT_AL=mean(regeff$FT) OTOV_AL=mean(regeff$OTOV) DRB_AL=mean(regeff$DRB) TEAM_R_AL=mean(regeff$TEAM_R) OFG_AL=mean(regeff$OFG) OFT_AL=mean(regeff$OFT) OX3P_AL=mean(regeff$O3P) OX2P_AL=mean(regeff$O2P) OFT_AL=mean(regeff$OFT) # identities PTS_AL=3*X3P_AL+2*X2P_AL+1*FT_AL empl_poss_AL=FGA_AL + 0.44*FTA_AL + TOV_AL - ORB_AL pts_poss_AL=PTS_AL/empl_poss_AL OPTS_AL=3*OX3P_AL+2*OX2P_AL+1*OFT_AL acq_poss_AL=OTOV_AL + DRB_AL+TEAM_R_AL+ OFG_AL + 0.45*OFT_AL ptsall_poss_AL=OPTS_AL/acq_poss_AL # now add the last equation and simulate WINS under the two scenarios W_BL <- reg_3$coefficients[1] +reg_3$coefficients[2]*(pts_poss-ptsall_poss) W_AL <- reg_3$coefficients[1] +reg_3$coefficients[2]*(pts_poss_AL-ptsall_poss_AL) VAL=W_AL-W_BL VAL ## ------------------------------------------------------------------------ # STOCHASTIC SIMULATION ## ------------------------------------------------------------------------ #baseline no shock FTA=mean(regeff$FTA) TOV=mean(regeff$TOV) ORB=mean(regeff$ORB) X3P=mean(regeff$X3P) X2P=mean(regeff$X2P) FGMISS=mean(regeff$FGMISS) FGA=X3P+X2P+FGMISS FT=mean(regeff$FT) OTOV=mean(regeff$OTOV) DRB=mean(regeff$DRB) TEAM_R=mean(regeff$TEAM_R) OFG=mean(regeff$OFG) OFT=mean(regeff$OFT) OX3P=mean(regeff$O3P) OX2P=mean(regeff$O2P) OFT=mean(regeff$OFT) # identities PTS=3*X3P+2*X2P+1*FT empl_poss=FGA + 0.44*FTA + TOV - ORB pts_poss=PTS/empl_poss OPTS=3*OX3P+2*OX2P+1*OFT acq_poss=OTOV + DRB+TEAM_R+ OFG + 0.45*OFT ptsall_poss=OPTS/acq_poss #ALTERNATIVE SCENARIO CHOOSE the SHOCK FTA_AL=mean(regeff$FTA) TOV_AL=mean(regeff$TOV) ORB_AL=mean(regeff$ORB) X3P_AL=mean(regeff$X3P)+1 X2P_AL=mean(regeff$X2P) FGMISS_AL=mean(regeff$FGMISS) FGA_AL=X3P_AL+X2P_AL+FGMISS_AL FT_AL=mean(regeff$FT) OTOV_AL=mean(regeff$OTOV) DRB_AL=mean(regeff$DRB) TEAM_R_AL=mean(regeff$TEAM_R) OFG_AL=mean(regeff$OFG) OFT_AL=mean(regeff$OFT) OX3P_AL=mean(regeff$O3P) OX2P_AL=mean(regeff$O2P) OFT_AL=mean(regeff$OFT) # identities PTS_AL=3*X3P_AL+2*X2P_AL+1*FT_AL empl_poss_AL=FGA_AL + 0.44*FTA_AL + TOV_AL - ORB_AL pts_poss_AL=PTS_AL/empl_poss_AL OPTS_AL=3*OX3P_AL+2*OX2P_AL+1*OFT_AL acq_poss_AL=OTOV_AL + DRB_AL+TEAM_R_AL+ OFG_AL + 0.45*OFT_AL ptsall_poss_AL=OPTS_AL/acq_poss_AL # preparing to simulate two scenarios simulation nrep <- 10^3 tT <- 1 # 1 obs W_BL <- W_AL<- VAL_STAT <- array(0, c(tT, nrep)) # the containers for (i in 1:nrep){ ### Monte Carlo # standard normal times the standard error of the estimatedof the relevant regression x <- rt( n=tT, df=314 )*summary(reg_3)$coefficients[2,2] # simulating wins under the two scenarios and the effect of a stat W_BL[, i] <- reg_3$coefficients[1] +(reg_3$coefficients[2]+x)*(pts_poss-ptsall_poss) W_AL[, i] <- reg_3$coefficients[1] +(reg_3$coefficients[2]+x)*(pts_poss_AL-ptsall_poss_AL) } VAL=W_AL-W_BL hist(VAL, breaks = seq(min(VAL), max(VAL), l = 20+1),prob=TRUE, main = "histogram of effects") curve(dnorm(x,mean=mean(VAL),sd=sd(VAL)),col='darkblue',lwd=2,add=TRUE) ## ------------------------------------------------------------------------ # MODEL REFINEMENTS ## ------------------------------------------------------------------------ #PERSONAL FOULS AND BLOCKED SHOTS Reg_PF = lm(regeff$OFT ~ regeff$PF + regeff$Team+regeff$Season_dum) summary(Reg_PF) Reg_BLK = lm(regeff$O2P ~ regeff$O2PA+regeff$BLK + regeff$Team+regeff$Season_dum) summary(Reg_BLK)