help.start() #just in case #Iconify the help window and move on to the next part. #Check the working directory before importing else provide full path setwd("C:/Users/favero/Dropbox/exam/SPORTMAN/R/intro_SA") # load packages listofpackages = c("ellipse","reshape2","ggplot2","dygraphs", "dplyr") for (j in listofpackages){ if(sum(installed.packages()[, 1] == j) == 0) { install.packages(j) } library(j, character.only = T) } NBAdata=read.csv("C:/Users/favero/Dropbox/exam/SPORTMAN/R/data_L/Teams_overall.csv", header = T, stringsAsFactors = F, sep = ",") head(NBAdata) typeof(NBAdata)#to check the type of data #See which R objects are now in the R workspace. ls() # rm(x, y) #Remove objects no longer needed. (Clean up). #DATA TRANSFORMATION NBAdata <- na.omit(NBAdata) NBAdata$FGMISS=NBAdata$FGA-NBAdata$FG NBAdata$FTMISS=NBAdata$FTA-NBAdata$FT NBAdata$FGMISS=NBAdata$FGA-NBAdata$FG NBAdata$FTMISS=NBAdata$FTA-NBAdata$FT NBAdata$OFGMISS=NBAdata$OFGA-NBAdata$OFG NBAdata$OFTMISS=NBAdata$OFTA-NBAdata$OFT NBAdata$MISS=NBAdata$FGMISS+NBAdata$FTMISS NBAdata$OMISS=NBAdata$OFGMISS+NBAdata$OFTMISS NBAdata$W.=NBAdata$W/(NBAdata$W+NBAdata$L) #create employed possessions NBAdata$empl_poss=NBAdata$FGA + 0.44*NBAdata$FTA + NBAdata$TOV - NBAdata$ORB NBAdata$ptsxgame=NBAdata$PTS/NBAdata$G NBAdata$ptsxposs=NBAdata$PTS/NBAdata$empl_poss #DESCRIPTIVE STATISTICS ## ------------------------------------------------------------------------ summary(NBAdata[,"empl_poss"]) # this is very useful to get a grip on the data structure #QUESTION what is the average number of possessions per game mean(NBAdata[,"empl_poss"]) sd(NBAdata[,"empl_poss"]) var(NBAdata[,"empl_poss"]) hist(NBAdata[,"empl_poss"]) ## ------------------------------------------------------------------------ #SUBSETTING # extracting teams and years with more than 8000 points NBA00 = subset(NBAdata, PTS>8000) #different methodology for extraction with a shortcoming NBA01 = NBAdata[NBAdata$PTS>8000,] #not that the first method will drop NA the second will not NBA01 = na.omit(NBA01) #extracting data for a single team: Golden State and Boston Celtics Golden_State = subset(NBAdata, Team == "Golden State Warriors", select = Season:W.) Boston_Celtics= subset(NBAdata, Team == "Boston Celtics", select = Season:W.) #Correlation datacor00 = subset(NBAdata,select=c(W.,DRB,FGMISS,AST,PTS,TOV,PF,STL,FTMISS,ORB)) datacor00 = na.omit(datacor00) cor.datacor = cor(datacor00, use="complete.obs") cor.datacor # nice graphics presentation of correlations needs package ellipse ord = order(cor.datacor[1,]) ordered.cor.datacor = cor.datacor[ord, ord] plotcorr(ordered.cor.datacor, col=cm.colors(11)[5*ordered.cor.datacor + 6]) # even nicer correlation heatmap needs packages reshape2 ggplot2 cormat = round(cor(datacor00),2) head(cormat) melted_cormat = melt(cormat) head(melted_cormat) ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + geom_tile() # Get lower triangle of the correlation matrix get_lower_tri=function(cormat){ cormat[upper.tri(cormat)] = NA return(cormat) } # Get upper triangle of the correlation matrix get_upper_tri = function(cormat){ cormat[lower.tri(cormat)] = NA return(cormat) } upper_tri = get_upper_tri(cormat) upper_tri # Melt the correlation matrix melted_cormat = melt(upper_tri, na.rm = TRUE) # Heatmap ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+ geom_tile(color = "white")+ scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") + theme_minimal()+ theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+ coord_fixed() #GRAPHICAL ANALYSIS #simple examples plot(y=NBAdata$W, x=NBAdata$DRB, col="red") plot(y=NBAdata$W, x=NBAdata$FG, col="blue") plot(y=NBAdata$W, x=NBAdata$AST, col="green") plot(y=NBAdata$W, x=NBAdata$PTS, col="red") plot(y=NBAdata$W, x=NBAdata$TOV, col="blue") plot(y=NBAdata$W, x=NBAdata$PF, col="green") plot(y=NBAdata$W, x=NBAdata$STL, col="red") plot(y=NBAdata$W, x=NBAdata$MFT, col="blue") plot(y=NBAdata$W, x=NBAdata$ORB, col="green") # cross plots # several plots on one canvas par(mfrow = c(1, 2)) plot(y=NBAdata$W, x=NBAdata$X2P,col="blue", main="r^2=0.571, P=0.008") #abline(reg = lm(NBAdata$W. ~ NBAdata$X2P ),col="red") plot(y=NBAdata$W, x=NBAdata$PTS,col="blue", main="r^2=0.612, P=0.004") par(mfrow = c(1, 1)) # alternative solution that looks better ggplot(data=NBAdata, aes(NBAdata$W.,NBAdata$X2P))+geom_point(colour = 'red', size = 3) # creating a TS plot plot(y = Golden_State$W., x = Golden_State$Season, type = "l",col = "blue",ylim = c(0,1),ylab = "Percent WINS.GSW", xlab = "Season", main = "Performance over time of GSW") lines(y = rep(mean(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red") lines(y = rep(mean(Golden_State$W.) + 2*sd(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red", lty = 2) lines(y = rep(mean(Golden_State$W.) - 2*sd(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red", lty = 2) # plot several plots on one canvas par(mfrow = c(1, 2)) plot(y = Golden_State$W., x = Golden_State$Season, type = "l",col = "blue",ylim = c(0,1),ylab = "Percent WINS.GSW", xlab = "Season", main = "Performance over time of GSW") lines(y = rep(mean(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red") lines(y = rep(mean(Golden_State$W.) + 2*sd(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red", lty = 2) lines(y = rep(mean(Golden_State$W.) - 2*sd(Golden_State$W.), length(Golden_State$W.)), x = Golden_State$Season, col = "red", lty = 2) plot(y = Boston_Celtics$W., x = Boston_Celtics$Season, type = "l",col = "blue",ylim = c(0,1),ylab = "Percent WINS.GSW", xlab = "Season", main = "Performance over time of BC") lines(y = rep(mean(Boston_Celtics$W.), length(Boston_Celtics$W.)), x = Boston_Celtics$Season, col = "red") lines(y = rep(mean(Boston_Celtics$W.) + 2*sd(Boston_Celtics$W.), length(Boston_Celtics$W.)), x = Boston_Celtics$Season, col = "red", lty = 2) lines(y = rep(mean(Boston_Celtics$W.) - 2*sd(Boston_Celtics$W.), length(Boston_Celtics$W.)), x = Boston_Celtics$Season, col = "red", lty = 2) par(mfrow = c(1, 1)) #TIME SERIES plotdata <- ts(Golden_State, start = 1980, frequency = 1) start <- 1980 end <- 2018 s1 <- window(plotdata[, "W."], start = start, end = end) plot(s1, type = "l", main = "GSW PERFORMANCE", ylab = "WINS") dygraph(s1, main = "GSW PErformance", ylab = "WINS") # HISTOGRAMS AND QQ PLOT ## ------------------------------------------------------------------------ hist(s1, breaks = seq(min(s1), max(s1), l = 20+1),prob=TRUE, main = "histogram of wins") curve(dnorm(x,mean=mean(s1),sd=sd(s1)),col='darkblue',lwd=2,add=TRUE) #hist(s1, breaks = seq(0, 1, l = 20+1),prob=TRUE, main = "histogram of wins") #curve(dnorm(x,mean=mean(s1),sd=sd(s1)),col='darkblue',lwd=2,add=TRUE) ## ------------------------------------------------------------------------ qqplot(window(plotdata[, "OTOV"], start = start, end = end), window(plotdata[, "TOV"], start = start, end = end), ylim = c(1000,1800), xlim = c(1000,1800), ylab = "OPP TOV", xlab = "TOV", main = "Quantile-Quantile plot (Q-Q plot)") mod5 <- lm(window(plotdata[, "TOV"], start = start, end = end) ~ -1 + window(plotdata[, "OTOV"], start = start, end = end)) abline(reg = mod5, col = "red") # qq-plot versus normal dist qqnorm(window(plotdata[, "TOV"], start = start, end = end), ylim = c(1000,1800),ylab = "TURNOVER. sample quantile", xlab = "TURNOVER. theoretical quantiles", main = "Normal (Q-Q plot)") qqline(window(plotdata[, "TOV"], start = start, end = end), datax = FALSE, distribution = qnorm, probs = c(0.25, 0.75), qtype = 7) ## last step: saving the dataset! write.csv(x = NBAdata, file = "data/NBAdata.csv") ## MODEL EXAMPLE loss <- function(m,n) { sapply(1:m, function(o) { x <- rbinom(n,1,0.85) k <- 1:(n-1) y <- x[k+1]+x[k] # the vector of successive sums a <- min(c(which(y==0),Inf)) # is finite if & only if an element of y is 0 b <- ifelse(a < n+1,1,0) b }) } summary(loss(10000,82)) #the above model could be used to compute the probability of a winning streak of k games