# Session setup.
# Move the working directory two levels up from the active document's path
# (i.e. to the parent of the folder holding this script); the relative data
# paths used further down are resolved against it.
# The active-document lookup only works inside RStudio. Elsewhere (Rscript,
# plain R console) the rstudioapi call fails, so it is guarded and the current
# working directory is kept instead.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  setwd(dirname(dirname(rstudioapi::getActiveDocumentContext()$path)))
} else {
  message("RStudio not available; keeping working directory: ", getwd())
}
# Clear the global environment so the script starts from a clean state.
# NOTE(review): this also removes any objects already in the user's session.
rm(list = ls())

# Install (when missing) and attach every package used by this script.
listofpackages <- c(
  "dplyr", "assertthat", "bindrcpp", "glue", "pkgconfig", "utf8",
  "cli", "ellipse", "reshape2", "ggplot2", "dygraphs", "aod"
)

for (j in listofpackages) {
  # requireNamespace() checks one package directly. installed.packages()
  # rebuilds the table of all installed packages on every iteration and is
  # documented as unsuitable for testing whether a package is present.
  if (!requireNamespace(j, quietly = TRUE)) {
    install.packages(j)
  }
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  library(j, character.only = TRUE)
}


# Player statistics for the 2003-2004 season, read from a semicolon-separated
# file with a header row; text columns are kept as character.
Players <- read.csv(
  file = "../data_L_2020/Players03-04.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)
# Misses = attempts minus makes, for field goals and for free throws.
Players$MFG <- with(Players, FGA - FG)
Players$MFT <- with(Players, FTA - FT)
# Points scaled by games (G) and by minutes played (MP).
Players$PTSxgame <- with(Players, PTS / G)
Players$PTSxminute <- with(Players, PTS / MP)

# Players <- Players %>%
#   group_by(Tm) %>%
#   slice_max(order_by = MP, n = 12, with_ties = FALSE) %>%
#   ungroup()

# table Berri page 154
# Players <- Players %>%
#   filter(MP > 2000, PTSxminute > 0.5)
  
# NBA measure of efficiency (season totals):
#   PTS + TRB + AST + STL + BLK - missed FG - missed FT - TOV
# The standard NBA efficiency formula credits assists; the AST term was
# missing from this sum and is included here.
# NOTE(review): if assists were left out on purpose (to mirror the unassisted
# Win Score below), drop the AST term again.
Players$eff_NBA <- with(
  Players,
  PTS + TRB + AST + STL + BLK - MFG - MFT - TOV
)

# Win Score measure of efficiency.

# Step 1: weight each box-score column by its coefficient.
# Scoring component: X3P, X2P and FT add; MFG and MFT subtract.
Players$W_scor <- with(
  Players,
  X3P * 0.066 + X2P * 0.033 + FT * 0.018 - MFG * 0.034 - MFT * 0.015
)
# ORB, DRB and STL add, TOV subtracts, all with the same 0.034 coefficient.
Players$W_pos_st <- with(Players, 0.034 * (ORB - TOV + DRB + STL))
# BLK adds, PF subtracts.
Players$W_pfblk <- with(Players, BLK * 0.021 - PF * 0.018)
# Assist component: stored as its own column but left out of W_all.
Players$W_ast <- with(Players, 0.022 * AST)
Players$W_all <- with(Players, W_scor + W_pos_st + W_pfblk) # + W_ast



# Two more variables are added to the data set because they are needed later.
# MP.: minutes played divided by 48 * 82.
Players$MP. <- with(Players, MP / (48 * 82))
# AVEPLW: the average player makes a team win half of its games.
Players$AVEPLW <- with(Players, MP. * (0.5 * 82 / 5))
# W_all_100: contribution to wins in the hypothetical case of always playing.
Players$W_all_100 <- with(Players, W_all / MP.)

# Step 2: account for position. Compute, for every position, the mean
# always-on-court contribution (W_all_100), then scale it by each player's
# own share of playing time (MP.) to obtain his position adjustment.
Players_pos <- group_by(Players, Pos)
# Non-finite rates (W_all / MP. with MP. == 0 gives NaN or Inf) are left out
# so that one player without minutes cannot turn a whole position's mean
# into NaN/Inf.
Wmeanxpos <- summarise(
  Players_pos,
  WTmeanxpos = mean(W_all_100[is.finite(W_all_100)])
)

# Look the position mean up by position label instead of by hard-coded row
# number. The previous indices 1..12 were only correct when the data held
# exactly the twelve expected positions in sorted order, and any other
# position silently kept a placeholder adjustment of 1.
Players$adj <- Players$MP. *
  Wmeanxpos$WTmeanxpos[match(Players$Pos, Wmeanxpos$Pos)]


# Adjusted wins: subtract the position adjustment (adj) from W_all and add
# AVEPLW.
Players$W_all_ua <- with(Players, W_all - adj + AVEPLW)
# The same quantity per 48 minutes played.
Players$W_48_ua <- with(Players, (W_all_ua / MP) * 48)

# Replicate Table 6_6: rows with Tm "LAL" followed by rows with Tm "MIA".
TABLE_6.6.1 <- Players[
  which(Players$Tm == "LAL"),
  c("Player", "Pos", "MP.", "Tm", "W_all", "W_all_ua", "W_48_ua")
]
TABLE_6.6.2 <- Players[
  which(Players$Tm == "MIA"),
  c("Player", "Pos", "MP.", "Tm", "W_all", "W_all_ua", "W_48_ua")
]
table_6.6 <- rbind(TABLE_6.6.1, TABLE_6.6.2)
# Same rows sorted by W_all (ties broken by Tm), largest first.
rank.table_6.6 <- table_6.6[
  with(table_6.6, order(W_all, Tm, decreasing = TRUE)),
]

# Replicate Table 6.7: the Win Score components for two named players.
TABLE_6.7.1 <- Players[
  which(Players$Player == "Kobe Bryant"),
  c(
    "Player", "MP", "Tm", "W_scor", "W_pos_st", "W_pfblk",
    "W_all", "W_all_ua", "AST", "W_48_ua"
  )
]
TABLE_6.7.2 <- Players[
  which(Players$Player == "Shaquille O'Neal*"),
  c(
    "Player", "MP", "Tm", "W_scor", "W_pos_st", "W_pfblk",
    "W_all", "W_all_ua", "AST", "W_48_ua"
  )
]
table_6.7 <- rbind(TABLE_6.7.1, TABLE_6.7.2)


# Alternative adjustment based on a different "average" player.
# For each player:
#   - take the players with the same Pos,
#   - among those, keep the ones whose MP lies between 90% and 110% of the
#     player's own MP,
#   - average W_all over that subset (missing values ignored).
Players <- Players %>%
  mutate(
    adj2 = vapply(
      seq_along(MP),
      function(i) {
        comparable <- Pos == Pos[i] & MP >= 0.9 * MP[i] & MP <= 1.1 * MP[i]
        mean(W_all[comparable], na.rm = TRUE)
      },
      numeric(1)
    )
  ) %>%
  as_tibble() # Players is a tibble from this point on

# Adjusted wins using the similar-minutes adjustment (adj2) in place of adj.
Players$W_all_ua1 <- with(Players, W_all - adj2 + AVEPLW)
# The same quantity per 48 minutes played.
Players$W_48_ua1 <- with(Players, (W_all_ua1 / MP) * 48)
# Table from Berri, page 154: among players with more than 2000 minutes and
# more than 0.5 points per minute, the 15 with the highest W_all_ua.
top15_players <- Players %>%
  filter(MP > 2000 & PTSxminute > 0.5) %>%
  select(Player, W_all_ua, W_48_ua, PTSxgame, PTSxminute) %>%
  arrange(desc(W_all_ua)) %>%
  head(15)

# Rows with Tm "LAL", with both versions of the adjusted wins.
TABLE_LAL <- Players %>%
  filter(Tm == "LAL") %>%
  select(Player, Pos, MP., Tm, W_all, W_all_ua, W_all_ua1, W_48_ua)
# Same rows sorted by W_all_ua1 (ties broken by Player), largest first.
TABLE_LAL_SOR <- TABLE_LAL[
  with(TABLE_LAL, order(W_all_ua1, Player, decreasing = TRUE)),
]
# Team totals of adjusted wins and of the playing-time share.
LAL_WP <- sum(TABLE_LAL$W_all_ua1)
LAL_MP <- sum(TABLE_LAL$MP.)
# Team-level data, comma-separated with a header row.
NBAdata <- read.csv(
  file = "../data_L_2020/Teams_overall2026.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
# Console checks: the W value for the Lakers row, and the team names, where
# Season is "2005".
# NOTE(review): the player file is named 03-04 while Season is filtered on
# "2005"; confirm how Season is coded in the team file.
with(NBAdata, W[Team == "Los Angeles Lakers" & Season == "2005"])
with(NBAdata, Team[Season == "2005"])

# Console checks: team totals under the two adjustments.
sum(TABLE_LAL$W_all_ua1)
sum(TABLE_LAL$W_all_ua)

# Table with wins produced.
# Rows whose Tm is "TOT" are dropped (presumably combined totals for players
# who changed team; verify against the data).
players_clean <- filter(Players, Tm != "TOT")

# Per-team sums of the two adjusted-win measures, ignoring missing values.
team_scores <- summarise(
  group_by(players_clean, Tm),
  Winscore_Berri = sum(W_all_ua, na.rm = TRUE),
  Winscore_mod = sum(W_all_ua1, na.rm = TRUE),
  .groups = "drop"
)

# Lookup from the team code stored in Tm to the full team name, used as the
# join key against NBAdata.
team_abbrev_mapping <- c(
  "POR" = "Portland Trail Blazers",
  "CHA" = "Charlotte Bobcats",
  "SEA" = "Seattle SuperSonics",
  "BOS" = "Boston Celtics",
  "TOR" = "Toronto Raptors",
  "NOH" = "New Orleans Hornets",
  "DEN" = "Denver Nuggets",
  "WAS" = "Washington Wizards",
  "NYK" = "New York Knicks",
  "LAL" = "Los Angeles Lakers",
  "ORL" = "Orlando Magic",
  "PHO" = "Phoenix Suns",
  "SAC" = "Sacramento Kings",
  "SAS" = "San Antonio Spurs",
  "MEM" = "Memphis Grizzlies",
  "UTA" = "Utah Jazz",
  "IND" = "Indiana Pacers",
  "NJN" = "New Jersey Nets",
  "GSW" = "Golden State Warriors",
  "DET" = "Detroit Pistons",
  "HOU" = "Houston Rockets",
  "DAL" = "Dallas Mavericks",
  "LAC" = "Los Angeles Clippers",
  "PHI" = "Philadelphia 76ers",
  "MIA" = "Miami Heat",
  "MIN" = "Minnesota Timberwolves",
  "CHI" = "Chicago Bulls",
  "ATL" = "Atlanta Hawks",
  "CLE" = "Cleveland Cavaliers",
  "MIL" = "Milwaukee Bucks"
)

# Add the full team name; codes absent from the lookup yield NA.
team_scores <- mutate(team_scores, Team = team_abbrev_mapping[Tm])

# 4) Attach each team's actual wins: take Team and W (renamed Actual_Wins)
# from the NBAdata rows with Season "2005" and left-join them on Team.
# NOTE(review): the player data is labelled 2003-2004 while the season filter
# is "2005"; confirm that this is the matching season in NBAdata.
team_summary <- left_join(
  team_scores,
  select(filter(NBAdata, Season == "2005"), Team, Actual_Wins = W),
  by = "Team"
)

# Shared axis limits covering both actual wins and predicted wins, so the
# 45-degree line is a true diagonal of the panel.
lims <- range(
  c(team_summary$Actual_Wins, team_summary$Winscore_mod),
  na.rm = TRUE
)

# Scatter of predicted wins (Winscore_mod) against actual wins.
lin_plot <- ggplot(team_summary, aes(x = Actual_Wins)) +
  # 45-degree reference line: points on it are predicted exactly.
  # `linewidth` replaces `size`, which is deprecated for line geoms since
  # ggplot2 3.4.0.
  geom_abline(
    intercept = 0, slope = 1,
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  # One point layer only: the same layer used to be added twice, which drew
  # every point twice and made the points darker than alpha = 0.6.
  geom_point(aes(y = Winscore_mod), color = "blue", alpha = 0.6, size = 2) +
  # Same limits on both axes AND a locked aspect ratio.
  xlim(lims) +
  ylim(lims) +
  coord_fixed(ratio = 1) +
  labs(
    title = "Actual and Predicted Wins",
    y = "Predicted Wins",
    x = "Wins",
    caption = ""
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
print(lin_plot)
# Save the figure into the current working directory.
ggsave("pred_wins.pdf", lin_plot, width = 8, height = 6)

# Adjusting for assists (a useful exercise): the commented-out lines below
# sketch how to add the assist contribution to the unassisted wins.
#Players$W_all_AST=Players$W_all_ua1+0.022*Players$AST

#Players$W_all_ua1=Players$W_all-Players$adj2+Players$adjMP
#Players$W_48_ua1=(Players$W_all_ua1/Players$MP)*48

#check that unassisted win is a  predictor of assisted win 
#reg_wins=lm(Players$W_all_AST ~  Players$W_all_ua1)
#summary(reg_wins)

# Rank players according to productivity and salary.
# Keep players with at least 1000 minutes and at least 30 games, and only the
# columns needed for the salary regressions.
WAGEPROD <- subset(
  Players,
  MP >= 1000 & G >= 30,
  select = c(
    Player, Tm, W_48_ua, W_48_ua1, W_all_ua, W_all_ua1, eff_NBA, SALARY
  )
)

# Drop rows with a missing value in any of the selected columns.
WAGEPROD <- na.omit(WAGEPROD)
# log() of a zero or negative salary is -Inf or NaN, which lm() rejects, so
# such rows are removed before taking logs.
WAGEPROD <- WAGEPROD[WAGEPROD$SALARY > 0, ]
WAGEPROD$l_SALARY <- log(WAGEPROD$SALARY)
# Scatterplot with a smooth curve. The title now names the plotted variables;
# the old one ("W48 ~ log(W)") did not match the x variable used.
scatter.smooth(
  x = WAGEPROD$W_all_ua1,
  y = WAGEPROD$l_SALARY,
  main = "log(SALARY) ~ W_all_ua1"
)

# Regression 1: log salary on the modified adjusted wins.
reg_prod <- lm(WAGEPROD$l_SALARY ~ WAGEPROD$W_all_ua1)
summary(reg_prod)
# Observed log salaries (blue points) and fitted values (red line), both
# plotted against the row index.
plot(
  WAGEPROD$l_SALARY,
  ylab = "Salary", ylim = c(12, 18), col = "blue", pch = 20
)
lines(fitted(reg_prod), type = "l", lwd = 2, col = "red")

# Regression 2: log salary on the NBA efficiency measure.
reg_prod_1 <- lm(WAGEPROD$l_SALARY ~ WAGEPROD$eff_NBA)
summary(reg_prod_1)
plot(
  WAGEPROD$l_SALARY,
  ylab = "Salary", ylim = c(12, 18), col = "blue", pch = 20
)
lines(fitted(reg_prod_1), type = "l", lwd = 2, col = "red")

# Regress log salary on the fitted values of both models, without intercept,
# to compare the two predictors.
reg_prod_test <- lm(
  WAGEPROD$l_SALARY ~ reg_prod$fitted.values + reg_prod_1$fitted.values - 1
)
summary(reg_prod_test)


