#clear the environment 
rm(list=ls()) 
## ------------------------------------------------------------------------
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
## -------

# load packages 
listofpackages = c("ellipse","reshape2","ggplot2","dygraphs", "plyr","dplyr")

for (j in listofpackages){
  if(sum(installed.packages()[, 1] == j) == 0) {
    install.packages(j)
  }
  library(j, character.only = T)
}
## ------------------------------------------------------------------------
## INPUT DATA 
## ------------------------------------------------------------------------
NBAdata=read.csv("Teams_overall2020.csv", header = T, stringsAsFactors = F, sep = ",")
head(NBAdata)

## ------------------------------------------------------------------------
## DATA PREVIEW 
## ------------------------------------------------------------------------
typeof(NBAdata)#to check the type of data
names(NBAdata)
dim(NBAdata)
classes.NBAdata = lapply(NBAdata, class)
NBAdata = na.omit(NBAdata)
## ------------------------------------------------------------------------
## DATA SUBSETTING
## ------------------------------------------------------------------------
sel.vars <- c("Team", "Season", "W", "L")
NBAdata.new = NBAdata[,sel.vars]
head(NBAdata.new)
TABLE_1=subset(NBAdata,(Team== "Chicago Bulls"),select=sel.vars)
TABLE_1=subset(NBAdata,(Team== "Chicago Bulls"),select=c(Team,Season,W,L))

TABLE_2=subset(NBAdata,(Season== 2005),select=c(Team,PTS,OPTS,W))
rank_TABLE_2 = TABLE_2[order(TABLE_2$PTS, decreasing = TRUE),]
rank_TABLE_2
rownames(rank_TABLE_2) <- NULL

## ------------------------------------------------------------------------
## DATA GROUPING
## ------------------------------------------------------------------------

# using ddply() function (from plyr package) 
avgPTSxSEA = ddply(NBAdata, c("Season"), summarise, PTS = mean(PTS))
plot(avgPTSxSEA)

TEAMSxSEAS = ddply(NBAdata, c("Season"), summarise, TEAMS = length(PTS))
plot(TEAMSxSEAS)
## ------------------------------------------------------------------------
## GRAPH AND DATA ANALYSIS 
## ------------------------------------------------------------------------
#DATA TRANSFORMATION 
NBAdata <- na.omit(NBAdata)
NBAdata$FGMISS=NBAdata$FGA-NBAdata$FG
NBAdata$FTMISS=NBAdata$FTA-NBAdata$FT
NBAdata$OFGMISS=NBAdata$OFGA-NBAdata$OFG
NBAdata$OFTMISS=NBAdata$OFTA-NBAdata$OFT
NBAdata$MISS=NBAdata$FGMISS+NBAdata$FTMISS
NBAdata$OMISS=NBAdata$OFGMISS+NBAdata$OFTMISS
NBAdata$W.=NBAdata$W/(NBAdata$W+NBAdata$L)
#create employed possessions 
NBAdata$empl_poss=NBAdata$FGA + 0.44*NBAdata$FTA + NBAdata$TOV - NBAdata$ORB
NBAdata$ptsxgame=NBAdata$PTS/NBAdata$G
NBAdata$ptsxposs=NBAdata$PTS/NBAdata$empl_poss
# qq-plot versus normal dist 
qqnorm(NBAdata$ptsxgame,
       ylim = c(70,130),ylab = "PTSx game. sample quantile",
       xlab = "PTSxgame. theoretical quantiles",
       main = "Normal (Q-Q plot)")
qqline(NBAdata$ptsxgame, datax = FALSE, distribution = qnorm,
       probs = c(0.25, 0.75), qtype = 7)
s1=NBAdata$ptsxgame
hist(s1, breaks = seq(min(s1), max(s1), l = 20+1),prob=TRUE, main = "histogram of points per game")
curve(dnorm(x,mean=mean(s1),sd=sd(s1)),col='darkblue',lwd=2,add=TRUE)
#CORRELATION 
datacor8099 = subset(NBAdata,(Season<2000 & Season>1979),select=c(W,X3P.,X2P.))
cor.datacor = cor(datacor8099, use="complete.obs")
cor.datacor

# nice graphics presentation of correlations needs package ellipse 
ord = order(cor.datacor[1,])
ordered.cor.datacor = cor.datacor[ord, ord]
plotcorr(ordered.cor.datacor, col=cm.colors(11)[5*ordered.cor.datacor + 6])

# even nicer correlation heatmap  needs packages reshape2 ggplot2
cormat = round(cor(datacor8099),2)
head(cormat)
melted_cormat = melt(cormat)
head(melted_cormat)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile()
# Get lower triangle of the correlation matrix
get_lower_tri=function(cormat){
  cormat[upper.tri(cormat)] = NA
  return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri = function(cormat){
  cormat[lower.tri(cormat)] = NA
  return(cormat)
}
upper_tri = get_upper_tri(cormat)
upper_tri
# Melt the correlation matrix
melted_cormat = melt(upper_tri, na.rm = TRUE)
# Heatmap
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()

## ------------------------------------------------------------------------
## TIME-SERIES PLOT OF GSW pace over the seasons 
## ------------------------------------------------------------------------
#NBAdata$pace=NBAdata$empl_poss/(48*82)
NBAdata$pace=NBAdata$empl_poss/(NBAdata$G*48)
sel.vars <- c( "Season", "pace")
GSW_data=subset(NBAdata,(Team== "Golden State Warriors"),select=sel.vars)
plot(y = GSW_data$pace, x = GSW_data$Season, type = "l",col = "blue",ylim = c(1.8,2.4),ylab = " pace",
     xlab = "Season",
     main = "GSW pace over time ")
lines(y = rep(mean(GSW_data$pace), length(GSW_data$pace)), x = GSW_data$Season, col = "red")
lines(y = rep(mean(GSW_data$pace) + 2*sd(GSW_data$pace), length(GSW_data$pace)), x = GSW_data$Season, col = "red", lty = 2)
lines(y = rep(mean(GSW_data$pace) - 2*sd(GSW_data$pace), length(GSW_data$pace)), x = GSW_data$Season, col = "red", lty = 2)

## ------------------------------------------------------------------------
## SHARE OF 3P over total SHOTS taken by CHICAGO BULLS over time 
## ------------------------------------------------------------------------
NBAdata$X3P_SH=NBAdata$X3PA/(NBAdata$X3PA+NBAdata$X2PA)
NBAdata$X3P_RAT=NBAdata$X3PA/NBAdata$X2PA
NBAdata$PPS_X3P=3*NBAdata$X3P/NBAdata$X3PA
NBAdata$PPS_X2P=2*NBAdata$X2P/NBAdata$X2PA
NBAdata$X3P_PROD=(NBAdata$PPS_X3P)/(NBAdata$PPS_X2P)
sel.vars <- c( "Season", "X3P_SH","X3P_PROD","X3P_RAT","PPS_X3P","PPS_X2P")
CB_data=subset(NBAdata,(Team== "Chicago Bulls"),select=sel.vars)
plot(y = CB_data$X3P_SH, x = CB_data$Season, type = "l",col = "blue",ylim = c(0,0.4),ylab = " 3PSH",
     xlab = "Season",
     main = "Chicago Bulls 3P share over time ")
lines(y = rep(mean(CB_data$X3P_SH), length(CB_data$X3P_SH)), x = CB_data$Season, col = "red")
lines(y = rep(mean(CB_data$X3P_SH) + 2*sd(CB_data$X3P_SH), length(CB_data$X3P_SH)), x = CB_data$Season, col = "red", lty = 2)
lines(y = rep(mean(CB_data$X3P_SH) - 2*sd(CB_data$X3P_SH), length(CB_data$X3P_SH)), x = CB_data$Season, col = "red", lty = 2)
## ------------------------------------------------------------------------
## RELATIVE EFFICIENCY of  3P and 2P SHOTS taken by CHICAGO BULLS over time 
## ------------------------------------------------------------------------

plot(y = CB_data$PPS_X3P, x = CB_data$Season, type = "l",col = "blue",ylim = c(0.5,1.5),ylab = " 3PSH",
     xlab = "Season",
     main = "Chicago Bulls Shooting Distribution ")
lines(y =CB_data$PPS_X2P , x = CB_data$Season, col = "red")

plot(y = CB_data$X3P_PROD, x = CB_data$Season, type = "l",col = "blue",ylim = c(0,2),ylab = " 3PSH",
     xlab = "Season",
     main = "Chicago Bulls Efficiency ")
lines(y =CB_data$X3P_RAT , x = CB_data$Season, col = "red")
legend(2005, 2, legend=c("3P Rel Productivity", "3p RATIO"),
       col=c("blue", "red"), lty=1:2, cex=0.8)



##MODEL SIMULATION

### From 10,000 simulations each derive a probability chart of teams of winning %s of (0.2, 0.5 and 0.8) of achieving win streaks of lengths (3w, 5w, and 7w) in a regular NBA Season.

#----------------------------- 3 WIN STREAKS ---------------------------#
#winning% = 80%
wins1 <- function(m,n)
{
  sapply(1:m, function(o)
  {
    x <- rbinom(n,1,0.80)
    k <- 1:(n-2)
    y <- x[k+2]+x[k+1]+x[k]                    
    a <- min(c(which(y==3),Inf))
    b <- ifelse(a < n+1,1,0)
    b
  })
}
q1 <- wins1(10000,82)
#winning% = 50%
wins2 <- function(m,n)
{
  sapply(1:m, function(o)
  {
    x <- rbinom(n,1,0.5)
    k <- 1:(n-2)
    y <- x[k+2]+x[k+1]+x[k]                    
    a <- min(c(which(y==3),Inf))
    b <- ifelse(a < n+1,1,0)
    b
  })
}
q2 <- wins2(10000,82)
#winning% = 20%
wins3 <- function(m,n)
{
  sapply(1:m, function(o)
  {
    x <- rbinom(n,1,0.2)
    k <- 1:(n-2)
    y <- x[k+2]+x[k+1]+x[k]                    
    a <- min(c(which(y==3),Inf))
    b <- ifelse(a < n+1,1,0)
    b
  })
}
q3 <- wins3(10000,82)

#----------------------------- 5 WIN STREAKS ---------------------------#
#winning% = 80%
wins4 <- function(m,n)
{
  sapply(1:m, function(o)
  {
    x <- rbinom(n,1,0.80)
    k <- 1:(n-4)
    y <- x[k+4]+x[k+3]+x[k+2]+x[k+1]+x[k]                    
    a <- min(c(which(y==5),Inf))
    b <- ifelse(a < n+1,1,0)
    b
  })
}
q4 <- wins4(10000,82)
#winning% = 50%
wins5 <- function(m,n)
{
  sapply(1:m, function(o)
  {
    x <- rbinom(n,1,0.50)
    k <- 1:(n-4)
    y <- x[k+4]+x[k+3]+x[k+2]+x[k+1]+x[k]                    
    a <- min(c(which(y==5),Inf))
    b <- ifelse(a < n+1,1,0)
    b
  })
}
q5 <- wins5(10000,82)
#winning% = 20%
wins6 <- function(m,n)
{
  sapply(1:m, function(o)
  {
    x <- rbinom(n,1,0.20)
    k <- 1:(n-4)
    y <- x[k+4]+x[k+3]+x[k+2]+x[k+1]+x[k]                    
    a <- min(c(which(y==5),Inf))
    b <- ifelse(a < n+1,1,0)
    b
  })
}
q6 <- wins6(10000,82)

#----------------------------- 7 WIN STREAKS ---------------------------#
#winning% = 80%
wins7 <- function(m,n)
{
  sapply(1:m, function(o)
  {
    x <- rbinom(n,1,0.80)
    k <- 1:(n-6)
    y <- x[k+6]+x[k+5]+x[k+4]+x[k+3]+x[k+2]+x[k+1]+x[k]                    
    a <- min(c(which(y==7),Inf))
    b <- ifelse(a < n+1,1,0)
    b
  })
}
q7 <- wins7(10000,82)
#winning% = 50%
wins8 <- function(m,n)
{
  sapply(1:m, function(o)
  {
    x <- rbinom(n,1,0.50)
    k <- 1:(n-6)
    y <- x[k+6]+x[k+5]+x[k+4]+x[k+3]+x[k+2]+x[k+1]+x[k]                    
    a <- min(c(which(y==7),Inf))
    b <- ifelse(a < n+1,1,0)
    b
  })
}
q8 <- wins8(10000,82)
#winning% = 20%
wins9 <- function(m,n)
{
  sapply(1:m, function(o)
  {
    x <- rbinom(n,1,0.20)
    k <- 1:(n-6)
    y <- x[k+6]+x[k+5]+x[k+4]+x[k+3]+x[k+2]+x[k+1]+x[k]                     
    a <- min(c(which(y==7),Inf))
    b <- ifelse(a < n+1,1,0)
    b
  })
}
q9 <- wins9(10000,82)

##___##
q1mean <- mean(q1)
q2mean <- mean(q2)
q3mean <- mean(q3)
q4mean <- mean(q4)
q5mean <- mean(q5)
q6mean <- mean(q6)
q7mean <- mean(q7)
q8mean <- mean(q8)
q9mean <- mean(q9)

prob.chart <- matrix(data=c(q1mean, q2mean, q3mean, q4mean, q5mean, q6mean, q7mean, 
                            q8mean, q9mean), 
                     ncol=3, byrow = TRUE)
row.names(prob.chart) <- c("3 Games winning streak", "5 Games winning streak", "7 Games winning streak")
colnames(prob.chart) <- c("80% win", "50% win", "20% win")
prob.chart
