#setwd(path)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#clear the environment
rm(list=ls())

# install and load the relevant packages
# packages used
listofpackages <- c("dplyr","assertthat","bindrcpp","glue","pkgconfig","utf8","cli","ellipse","reshape2","ggplot2","dygraphs","aod")

for (j in listofpackages){
  if(sum(installed.packages()[, 1] == j) == 0) {
    install.packages(j)
  }
  library(j, character.only = T)
}

NBAdata=read.csv("../data_L_2020/Teams_overall2026.csv", header = T, stringsAsFactors = F, sep = ",")
head(NBAdata)
typeof(NBAdata)#to check the type of data

#remember that season 200x/200(x+1) is coded as 200(x+1) and Oklahoma City Thunder were Seattle Supersonics until 2007

## ------------------------------------------------------------------------
#SAMPLE CHOICE 
## ------------------------------------------------------------------------
#here we choose sample to replicate Berri et al. Wages of wins 
NBAdata = subset(NBAdata,(Season>1993 & Season<2006 & Season!=1999))
## ------------------------------------------------------------------------
#DATA TRANSFORMATION 
## ------------------------------------------------------------------------
NBAdata$FGMISS=NBAdata$FGA-NBAdata$FG
NBAdata$FTMISS=NBAdata$FTA-NBAdata$FT
NBAdata$OFGMISS=NBAdata$OFGA-NBAdata$OFG
NBAdata$OFTMISS=NBAdata$OFTA-NBAdata$OFT
NBAdata$MISS=NBAdata$FGMISS+NBAdata$FTMISS
NBAdata$OMISS=NBAdata$OFGMISS+NBAdata$OFTMISS
NBAdata$W.=NBAdata$W/(NBAdata$W+NBAdata$L)
#NBA EFFICIENCY MEASURES
NBAdata$eff_NBA=NBAdata$PTS+NBAdata$TRB+NBAdata$STL+NBAdata$BLK-NBAdata$MISS-NBAdata$TOV
NBAdata$eff_O_NBA=NBAdata$OPTS+NBAdata$OTRB+NBAdata$OSTL+NBAdata$OBLK-NBAdata$OMISS-NBAdata$OTOV
NBAdata$rel_effNBA=NBAdata$eff_NBA-NBAdata$eff_O_NBA
NBAdata$PTS_diff=NBAdata$PTS-NBAdata$OPTS
NBAdata$TRB_diff=NBAdata$TRB-NBAdata$OTRB
NBAdata$STL_diff=NBAdata$STL-NBAdata$OSTL
NBAdata$MISS_diff=NBAdata$MISS-NBAdata$OMISS
NBAdata$TOV_diff=NBAdata$TOV-NBAdata$OTOV 
NBAdata$BLK_diff=NBAdata$BLK-NBAdata$OBLK
NBAdata$PF_diff=NBAdata$PF-NBAdata$OPF
NBAdata$AST_diff=NBAdata$AST-NBAdata$OAST


#CORRELATION ANALYSIS

datacor00 = subset(NBAdata,select=c(W.,DRB,FGMISS,AST,PTS,TOV,PF,ORB,STL,BLK,FTMISS))
datacor00 = na.omit(datacor00)
cor.datacor = cor(datacor00, use="complete.obs")
cor.datacor

# nice graphics presentation of correlations needs package ellipse
ord = order(cor.datacor[1,])
ordered.cor.datacor = cor.datacor[ord, ord]
plotcorr(ordered.cor.datacor, col=cm.colors(11)[5*ordered.cor.datacor + 6])

# even nicer correlation heatmap  needs packages reshape2 ggplot2
cormat = round(cor(datacor00),2)
head(cormat)
melted_cormat = melt(cormat)
head(melted_cormat)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile()
# Get lower triangle of the correlation matrix
get_lower_tri=function(cormat){
  cormat[upper.tri(cormat)] = NA
  return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri = function(cormat){
  cormat[lower.tri(cormat)] = NA
  return(cormat)
}
upper_tri = get_upper_tri(cormat)
upper_tri
# Melt the correlation matrix
melted_cormat = melt(upper_tri, na.rm = TRUE)
# Heatmap
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name="Pearson\nCorrelation") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 12, hjust = 1))+
  coord_fixed()

## ------------------------------------------------------------------------
# REGRESSIONS WITHOUT A MODEL 
## ------------------------------------------------------------------------

reg_op1 = lm(W~ PTS_diff+TRB_diff+STL_diff+BLK_diff+MISS_diff+TOV_diff,NBAdata )
summary(reg_op1)

reg_up1 = lm(W~rel_effNBA,NBAdata)
summary(reg_up1)

plot(NBAdata$W, pch=20, ylim=c(10, 82),
     xaxt="n", xlab="Team and Time", ylab="Wins",col = "blue")
lines(reg_op1$fitted.values,col = "red",pch=8)
lines(reg_up1$fitted.values,col = "green",pch=8)

plot(reg_op1$residuals, pch=20, ylim=c(-20, 20),
     xaxt="n", xlab="Team and Time", ylab="Wins",col = "blue")
lines(reg_up1$residuals,col = "red",pch=8)

## ------------------------------------------------------------------------
# REGRESSIONS WITH A MODEL  
## ------------------------------------------------------------------------

## ------------------------------------------------------------------------
#DATA TRANSFORMATION
## ------------------------------------------------------------------------

#EMPLOYED POSSESSIONS
NBAdata$empl_poss=NBAdata$FGA + 0.44*NBAdata$FTA + NBAdata$TOV - NBAdata$ORB
NBAdata$ptsxgame=NBAdata$PTS/NBAdata$G
NBAdata$ptsxposs=NBAdata$PTS/NBAdata$empl_poss


#ACQUIRED POSSESSIONS
#Constructing Team Rebounds 
NBAdata$FGAD=NBAdata$FGA-NBAdata$OFG-NBAdata$OTOV-NBAdata$TRB+NBAdata$TOV
NBAdata$TEAM_R=NBAdata$FGAD-0.45*NBAdata$OFT+0.45*NBAdata$FTA
#using Team Rebounds
NBAdata$acq_poss=NBAdata$OTOV + NBAdata$DRB+NBAdata$TEAM_R+ NBAdata$OFG + 0.45*NBAdata$OFT
NBAdata$ptsaxgame=NBAdata$OPTS/NBAdata$G
NBAdata$ptsall_poss=NBAdata$OPTS/NBAdata$acq_poss
NBAdata$eff=(NBAdata$ptsxposs-NBAdata$ptsall_poss)

#CHECKING TEAM REBOUNDS CONSTRUCTION 
plot(x = NBAdata$Season, y = NBAdata$TEAM_R, main = "Estimated Team Rebounds", ylab = "TR",ylim = c(200,4000), xlab = "Seasons",col = "red")
points(x = NBAdata$Season, y = NBAdata$DRB,col = "blue")

reg_1 = lm(FGAD ~ OFT + FTA,NBAdata )
summary(reg_1)
#Hypothesis Testing
wald.test(vcov(reg_1), coef(reg_1), H0 = c(0.45, -0.45), df = 313,
          L = matrix(c(0, 0, 1, 0,
                       0, 1), ncol = 3), verbose = T)

#USING Regressions results to create proxy
NBAdata$TEAM_R1=coef(reg_1)[1]+reg_1$residuals

plot(x = NBAdata$Season, y = NBAdata$TEAM_R, main = "Estimated Team Rebounds", ylab = "TR",ylim = c(200,600), xlab = "Seasons",col = "red")
points(x = NBAdata$Season, y = NBAdata$TEAM_R1,col = "blue")

#CHECKING ACQUIRED AND EMPLOYED POSSESSIONS 
plot(x = NBAdata$Season, y = NBAdata$acq_poss, main = "Estimated Possession", ylab = "TR",ylim = c(6000,10000), xlab = "Seasons",col = "red")
points(x = NBAdata$Season, y = NBAdata$empl_poss,col = "blue")
DATA_AH=subset(NBAdata,(Team== "Atlanta Hawks"),select=c(Season,acq_poss,empl_poss))
plot(x = DATA_AH$Season, y = DATA_AH$acq_poss, main = "Atlanta Hawks", ylab = "TPOSS",ylim = c(6000,10000), xlab = "Seasons",col = "red")
points(x = DATA_AH$Season, y = DATA_AH$empl_poss,col = "blue")

## ------------------------------------------------------------------------
# DESCRIPTIVE ANALYSIS
## ------------------------------------------------------------------------

#What we want to see now is the most efficient team in employing possessions 
TABLE_6.2=subset(NBAdata,(Season== 2005),select=c(Team,ptsxposs,ptsxgame))
ranking6_2PTSxP = TABLE_6.2[order(TABLE_6.2$ptsxposs, TABLE_6.2$Team, decreasing = TRUE),]
#similarly we rank teams wrt points scored per game
ranking6_2PTSxG = TABLE_6.2[order(TABLE_6.2$ptsxgame, TABLE_6.2$Team, decreasing = TRUE),]
#same ranking for the acquired possessions
TABLE_6.3=subset(NBAdata,(Season== 2005),select=c(Team,ptsall_poss,ptsaxgame))
ranking6_3PTSAxPA = TABLE_6.3[order(TABLE_6.3$ptsall_poss, TABLE_6.3$Team, decreasing = FALSE),]
ranking6_3PTSAxG = TABLE_6.3[order(TABLE_6.3$ptsaxgame, TABLE_6.3$Team, decreasing = FALSE),]

# MODEL ESTIMATION measuring the impact of defensive and offensive efficiency

NBAdata$Season_dum=as.character(NBAdata$Season)
plot(x = NBAdata$Season, y = NBAdata$W, main = "WINS", ylab = "WINS",ylim = c(0,85), xlab = "Seasons",col = "red")
#The WINSCORE MODEL

reg_WSU = lm(W ~ ptsall_poss+ptsxposs, NBAdata )
summary(reg_WSU)
reg_WS = lm(W ~ eff, NBAdata )
summary(reg_WS)

#evaluating the model against the standard NBA efficiency model 

reg_test=lm(W ~  eff+rel_effNBA,NBAdata )
summary(reg_test)

plot(NBAdata$W, pch=20, ylim=c(15, 82), ylab="Wins",col = "blue")
lines(reg_WS$fitted.values,col = "red", lwd = 2,type="l")
# storing the residuals
resid_WS <- reg_WS$residuals

## ------------------------------------------------------------------------
# MODEL SIMULATION
## ------------------------------------------------------------------------

## ------------------------------------------------------------------------
# DETERMINISTIC  SIMULATION
## ------------------------------------------------------------------------
#baseline no shock, specify values for each stats 

FTA=mean(NBAdata$FTA)
TOV=mean(NBAdata$TOV)
ORB=mean(NBAdata$ORB)
X3P=mean(NBAdata$X3P)
X2P=mean(NBAdata$X2P)
FGMISS=mean(NBAdata$FGMISS)
FGA=X3P+X2P+FGMISS
FT=mean(NBAdata$FT)
OTOV=mean(NBAdata$OTOV)
DRB=mean(NBAdata$DRB)
TEAM_R=mean(NBAdata$TEAM_R)
OFG=mean(NBAdata$OFG)
OFT=mean(NBAdata$OFT)
OX3P=mean(NBAdata$O3P)
OX2P=mean(NBAdata$O2P)
OFT=mean(NBAdata$OFT)

# identities
PTS=3*X3P+2*X2P+1*FT
empl_poss=FGA + 0.44*FTA + TOV - ORB
pts_poss=PTS/empl_poss
OPTS=3*OX3P+2*OX2P+1*OFT
acq_poss=OTOV + DRB+TEAM_R+ OFG + 0.45*OFT
ptsall_poss=OPTS/acq_poss

#ALTERNATIVE SCENARIO CHOOSE the SHOCK

FTA_AL=mean(NBAdata$FTA)
TOV_AL=mean(NBAdata$TOV)
ORB_AL=mean(NBAdata$ORB)
X3P_AL=mean(NBAdata$X3P)+1
X2P_AL=mean(NBAdata$X2P)
FGMISS_AL=mean(NBAdata$FGMISS)
FGA_AL=X3P_AL+X2P_AL+FGMISS_AL
FT_AL=mean(NBAdata$FT)
OTOV_AL=mean(NBAdata$OTOV)
DRB_AL=mean(NBAdata$DRB)
TEAM_R_AL=mean(NBAdata$TEAM_R)
OFG_AL=mean(NBAdata$OFG)
OFT_AL=mean(NBAdata$OFT)
OX3P_AL=mean(NBAdata$O3P)
OX2P_AL=mean(NBAdata$O2P)
OFT_AL=mean(NBAdata$OFT)

# identities
PTS_AL=3*X3P_AL+2*X2P_AL+1*FT_AL
empl_poss_AL=FGA_AL + 0.44*FTA_AL + TOV_AL - ORB_AL
pts_poss_AL=PTS_AL/empl_poss_AL
OPTS_AL=3*OX3P_AL+2*OX2P_AL+1*OFT_AL
acq_poss_AL=OTOV_AL + DRB_AL+TEAM_R_AL+ OFG_AL + 0.45*OFT_AL
ptsall_poss_AL=OPTS_AL/acq_poss_AL

# now add the last equation and simulate WINS under the two scenarios 
W_BL <- reg_WS$coefficients[1] +reg_WS$coefficients[2]*(pts_poss-ptsall_poss)
W_AL <- reg_WS$coefficients[1] +reg_WS$coefficients[2]*(pts_poss_AL-ptsall_poss_AL)
VAL=W_AL-W_BL
VAL 


## ------------------------------------------------------------------------
# STOCHASTIC  SIMULATION
## ------------------------------------------------------------------------

# preparing to simulate two scenarios  simulation
nrep <- 10^3
tT <- 1 # 1 obs
W_BL <- W_AL<- VAL_STAT <- array(0, c(tT, nrep)) # the containers

for (i in 1:nrep){

    ### Monte Carlo
  # t distribution times the standard error of the relevant estimated coeff
    x <- rt( n=tT, df=314 )*summary(reg_WS)$coefficients[2,2]

  # simulating wins under the two scenarios and the effect of a stat
 
 W_BL[, i] <- reg_WS$coefficients[1] +(reg_WS$coefficients[2]+x)*(pts_poss-ptsall_poss)
  W_AL[, i] <- reg_WS$coefficients[1] +(reg_WS$coefficients[2]+x)*(pts_poss_AL-ptsall_poss_AL)

}

VAL=W_AL-W_BL
hist(VAL, breaks = seq(min(VAL), max(VAL), l = 20+1),prob=TRUE, main = "histogram of effects")
curve(dnorm(x,mean=mean(VAL),sd=sd(VAL)),col='darkblue',lwd=2,add=TRUE)
## ------------------------------------------------------------------------
# MODEL REFINEMENTS 
## ------------------------------------------------------------------------

#PERSONAL FOULS AND BLOCKED SHOTS
Reg_PF_OLS = lm(OFT ~ PF,NBAdata)
summary(Reg_PF_OLS)
Reg_PF = lm(OFT ~ PF + Team+Season_dum,NBAdata)
summary(Reg_PF)
Reg_BLK = lm(O2P ~ O2PA+BLK + Team+Season_dum,NBAdata)
summary(Reg_BLK)
NBAdata$eff_DM=NBAdata$eff-mean(NBAdata$eff)
NBAdata$F_ABF=NBAdata$AST_diff+NBAdata$BLK_diff-NBAdata$PF_diff

reg_WS = lm(W ~ eff_DM, NBAdata )
summary(reg_WS)
reg_WS_aug = lm(W ~ eff_DM+F_ABF, NBAdata )
summary(reg_WS_aug)

## ------------------------------------------------------------------------
# HOW TO USE WINSCORE TO PREDICT WINS 
## ------------------------------------------------------------------------


NBAdata$W_scor = NBAdata$X3P*0.066+NBAdata$X2P*0.033+NBAdata$FT*0.018 - (NBAdata$FGA-NBAdata$FG)*0.034 - (NBAdata$FTA-NBAdata$FT)*0.015
NBAdata$W_pos_st = 0.034*(NBAdata$ORB - NBAdata$TOV + NBAdata$DRB + NBAdata$STL)
NBAdata$W_pfblk = NBAdata$BLK*0.021 - NBAdata$PF*0.018
NBAdata$W_pfblka = NBAdata$AST*0.022+NBAdata$BLK*0.021 - NBAdata$PF*0.018
NBAdata$W_all = NBAdata$W_scor+NBAdata$W_pos_st+NBAdata$W_pfblk
NBAdata$W_all_a = NBAdata$W_scor+NBAdata$W_pos_st+NBAdata$W_pfblka

ggplot(NBAdata, aes(x = W_all)) +
  geom_point(aes(y = W), color = "blue", alpha = 0.6, size = 2, shape = 16) +  # Actual values
  labs(title = "Wins vs Predicted Wins",
       x = "W_allc",
       y = "W",
       caption = "Blue dots: actual values") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

NBAdata$OW_scor = NBAdata$O3P*0.066+NBAdata$O2P*0.033+NBAdata$OFT*0.018 - (NBAdata$OFGA-NBAdata$OFG)*0.034 - (NBAdata$OFTA-NBAdata$OFT)*0.015
NBAdata$OW_pos_st = 0.034*(NBAdata$OORB - NBAdata$OTOV + NBAdata$ODRB + NBAdata$OSTL)
NBAdata$OW_pfblk = NBAdata$OBLK*0.021 - NBAdata$OPF*0.018
NBAdata$OW_pfblka = NBAdata$OAST*0.022+NBAdata$OBLK*0.021 - NBAdata$OPF*0.018
NBAdata$OW_all = NBAdata$OW_scor+NBAdata$OW_pos_st+NBAdata$OW_pfblk
NBAdata$OW_all_a = NBAdata$OW_scor+NBAdata$OW_pos_st+NBAdata$OW_pfblka

NBAdata$W_allc = NBAdata$W_all -NBAdata$OW_all+41
NBAdata$W_allc_a = NBAdata$W_all_a -NBAdata$OW_all_a+41

ggplot(NBAdata, aes(x = W_allc)) +
  geom_point(aes(y = W), color = "blue", alpha = 0.6, size = 2, shape = 16) +  # Actual values
  labs(title = "Wins vs Predicted Wins",
       x = "W_allc",
       y = "W",
       caption = "Model without assists") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(NBAdata, aes(x = W_allc_a)) +
  geom_point(aes(y = W), color = "blue", alpha = 0.6, size = 2, shape = 16) +  # Actual values
  labs(title = "Wins vs Predicted Wins",
       x = "W_allc",
       y = "W",
       caption = "Model with assists") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(NBAdata, aes(x = W_allc)) +
  # Actual values
  geom_point(aes(y = W), color = "blue", alpha = 0.6, size = 2) +
  
  # Highlight Los Angeles Lakers with green points and labels
  geom_point(data = subset(NBAdata, Team == "Los Angeles Lakers"),
             aes(y = W), color = "darkgreen", size = 3) +
  geom_text(data = subset(NBAdata, Team == "Los Angeles Lakers"),
            aes(y = W, label = "LAL"), color = "darkgreen", vjust = -1, size = 3) +
  
  # Labels and theme
  labs(title = "Wins vs Predicted Wins with LAL Highlighted",
       x = "W_allc",
       y = "W",
       caption = "Green points: Boston Celtics (BC)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))






