help.start()
#just in case 

#clear the environment 
rm(list=ls()) 
## ------------------------------------------------------------------------
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
## -------

# load packages 
listofpackages = c("ellipse","reshape2","ggplot2","dygraphs","plyr", "dplyr")

for (j in listofpackages){
  if(sum(installed.packages()[, 1] == j) == 0) {
    install.packages(j)
  }
  library(j, character.only = T)
}
#original data in data_L
NBAdata=read.csv("Teams_overall2020.csv", header = T, stringsAsFactors = F, sep = ",")
head(NBAdata)
typeof(NBAdata)#to check the type of data

#See which R objects are now in the R workspace.
ls()


#DATA TRANSFORMATION 
NBAdata <- na.omit(NBAdata)
NBAdata$FGMISS=NBAdata$FGA-NBAdata$FG
NBAdata$FTMISS=NBAdata$FTA-NBAdata$FT
NBAdata$OFGMISS=NBAdata$OFGA-NBAdata$OFG
NBAdata$OFTMISS=NBAdata$OFTA-NBAdata$OFT
NBAdata$MISS=NBAdata$FGMISS+NBAdata$FTMISS
NBAdata$OMISS=NBAdata$OFGMISS+NBAdata$OFTMISS
NBAdata$W.=NBAdata$W/(NBAdata$W+NBAdata$L)
#create employed possessions 
NBAdata$empl_poss=NBAdata$FGA + 0.44*NBAdata$FTA + NBAdata$TOV - NBAdata$ORB
NBAdata$pace=NBAdata$empl_poss/(48*82)
NBAdata$ptsxgame=NBAdata$PTS/NBAdata$G
NBAdata$ptsxposs=NBAdata$PTS/NBAdata$empl_poss
#DESCRIPTIVE STATISTICS 
## ------------------------------------------------------------------------
summary(NBAdata[,"empl_poss"]) # this is very useful to get a grip on the data structure
#QUESTION what is the average number of possessions per game 
mean(NBAdata[,"empl_poss"])
sd(NBAdata[,"empl_poss"])
var(NBAdata[,"empl_poss"])

hist(NBAdata[,"empl_poss"])
## ------------------------------------------------------------------------
#SUBSETTING
# extracting teams and years with more than 8000 points
NBA00 = subset(NBAdata, PTS>8000)
#different methodology for extraction with a shortcoming
NBA01 = NBAdata[NBAdata$PTS>8000,]
#not that the first method will drop NA the second will not 
NBA01 = na.omit(NBA01)

#extracting data for a single team: Golden State and Boston Celtics
Golden_State = subset(NBAdata, Team == "Golden State Warriors", select = Season:W.)
Boston_Celtics= subset(NBAdata, Team == "Boston Celtics", select = Season:W.)


### data grouping 
# using ddply() function (from plyr package) 

TEAMSxSEAS = ddply(NBAdata, c("Season"), summarise, TEAMS = length(Team))
plot(TEAMSxSEAS)

count.seas_team = aggregate(list(Season=NBAdata$Season), 
                         by=list(Team=NBAdata$Team),
                         FUN=length)

count.team_seas = aggregate(list(Team=NBAdata$Team), 
                            by=list(Season=NBAdata$Season),
                            FUN=length)
#Correlation

datacor00 = subset(NBAdata,select=c(W.,DRB,FGMISS,AST,PTS,TOV,PF,STL,FTMISS,ORB))
datacor00 = na.omit(datacor00)
cor.datacor = cor(datacor00, use="complete.obs")
cor.datacor

# nice graphics presentation of correlations needs package ellipse 
ord = order(cor.datacor[1,])
ordered.cor.datacor = cor.datacor[ord, ord]
plotcorr(ordered.cor.datacor, col=cm.colors(11)[5*ordered.cor.datacor + 6])

# even nicer correlation heatmap  needs packages reshape2 ggplot2
cormat = round(cor(datacor00),2)
head(cormat)
melted_cormat = melt(cormat)
head(melted_cormat)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile()
# Get lower triangle of the correlation matrix
get_lower_tri=function(cormat){
  cormat[upper.tri(cormat)] = NA
  return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri = function(cormat){
  cormat[lower.tri(cormat)] = NA
  return(cormat)
}
upper_tri = get_upper_tri(cormat)
upper_tri
# Melt the correlation matrix
melted_cormat = melt(upper_tri, na.rm = TRUE)
# Heatmap
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()

#GRAPHICAL ANALYSIS 
#simple examples 
plot(y=NBAdata$W, x=NBAdata$DRB, col="red")
plot(y=NBAdata$W, x=NBAdata$FG, col="blue")
plot(y=NBAdata$W, x=NBAdata$AST, col="green")
plot(y=NBAdata$W, x=NBAdata$PTS, col="red")
plot(y=NBAdata$W, x=NBAdata$TOV, col="blue")
plot(y=NBAdata$W, x=NBAdata$PF, col="green")
plot(y=NBAdata$W, x=NBAdata$STL, col="red")
plot(y=NBAdata$W, x=NBAdata$MFT, col="blue")
plot(y=NBAdata$W, x=NBAdata$ORB, col="green")
# cross plots 
# several plots on one canvas
par(mfrow = c(1, 2))
plot(y=NBAdata$W, x=NBAdata$X2P,col="blue", main="r^2=0.571, P=0.008")
#abline(reg = lm(NBAdata$W. ~ NBAdata$X2P ),col="red")
plot(y=NBAdata$W, x=NBAdata$PTS,col="blue", main="r^2=0.612, P=0.004")

par(mfrow = c(1, 1))

# alternative solution that looks better 
ggplot(data=NBAdata, aes(NBAdata$W.,NBAdata$X2P))+geom_point(colour = 'red', size = 3)
# creating a TS plot 
plot(y = Golden_State$W., x = Golden_State$Season, type = "l",col = "blue",ylim = c(0,1),ylab = "Percent WINS.GSW",
     xlab = "Season",
     main = "Performance over time of GSW")
lines(y = rep(mean(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red")
lines(y = rep(mean(Golden_State$W.) + 2*sd(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red", lty = 2)
lines(y = rep(mean(Golden_State$W.) - 2*sd(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red", lty = 2)

# plot several plots on one canvas
par(mfrow = c(1, 2))
plot(y = Golden_State$W., x = Golden_State$Season, type = "l",col = "blue",ylim = c(0,1),ylab = "Percent WINS.GSW",
     xlab = "Season",
     main = "Performance over time of GSW")
lines(y = rep(mean(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red")
lines(y = rep(mean(Golden_State$W.) + 2*sd(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red", lty = 2)
lines(y = rep(mean(Golden_State$W.) - 2*sd(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red", lty = 2)

plot(y = Boston_Celtics$W., x = Boston_Celtics$Season, type = "l",col = "blue",ylim = c(0,1),ylab = "Percent WINS.GSW",
     xlab = "Season",
     main = "Performance over time of BC")
lines(y = rep(mean(Boston_Celtics$W.), length(Boston_Celtics$W.)), x = Boston_Celtics$Season, col = "red")
lines(y = rep(mean(Boston_Celtics$W.) + 2*sd(Boston_Celtics$W.), length(Boston_Celtics$W.)), x = Boston_Celtics$Season, col = "red", lty = 2)
lines(y = rep(mean(Boston_Celtics$W.) - 2*sd(Boston_Celtics$W.), length(Boston_Celtics$W.)), x = Boston_Celtics$Season, col = "red", lty = 2)

par(mfrow = c(1, 1))

#TIME SERIES 
plotdata <- ts(Golden_State, start = 1980, frequency = 1)
start <- 1980
end <- 2018
s1 <- window(plotdata[, "W."], start = start, end = end)
plot(s1, type = "l", main = "GSW PERFORMANCE", ylab = "WINS")
dygraph(s1, main = "GSW PErformance", ylab = "WINS")

# HISTOGRAMS AND QQ PLOT 
## ------------------------------------------------------------------------

hist(s1, breaks = seq(min(s1), max(s1), l = 20+1),prob=TRUE, main = "histogram of wins")
curve(dnorm(x,mean=mean(s1),sd=sd(s1)),col='darkblue',lwd=2,add=TRUE)
#hist(s1, breaks = seq(0, 1, l = 20+1),prob=TRUE, main = "histogram of wins")
#curve(dnorm(x,mean=mean(s1),sd=sd(s1)),col='darkblue',lwd=2,add=TRUE)
## ------------------------------------------------------------------------

qqplot(window(plotdata[, "OTOV"], start = start, end = end),
       window(plotdata[, "TOV"], start = start, end = end),
       ylim = c(1000,1800), xlim = c(1000,1800),
       ylab = "OPP TOV",
       xlab = "TOV",
       main = "Quantile-Quantile plot (Q-Q plot)")
mod5 <- lm(window(plotdata[, "TOV"], start = start, end = end) ~ -1 + window(plotdata[, "OTOV"], start = start, end = end))
abline(reg = mod5, col = "red")

# qq-plot versus normal dist 
qqnorm(window(plotdata[, "TOV"], start = start, end = end),
       ylim = c(1000,1800),ylab = "TURNOVER. sample quantile",
       xlab = "TURNOVER. theoretical quantiles",
       main = "Normal (Q-Q plot)")
qqline(window(plotdata[, "TOV"], start = start, end = end), datax = FALSE, distribution = qnorm,
       probs = c(0.25, 0.75), qtype = 7)

## last step: saving the dataset!
# write.csv(x = NBAdata, file = "data/NBAdata.csv")