#------------------------
# QUESTION 1 
#------------------------
# Clear the global environment so the script starts from a clean slate.
# NOTE(review): this also wipes any objects the caller had in their session.
rm(list = ls())
## ------------------------------------------------------------------------
# Make the folder containing this script the working directory so that the
# relative file paths used further down resolve. The location is only known
# when the script is open in RStudio, so the lookup is guarded: outside
# RStudio (e.g. Rscript), without the rstudioapi package, or for an unsaved
# document, the current working directory is kept instead of aborting.
local({
  script_path <- ""
  if (requireNamespace("rstudioapi", quietly = TRUE) &&
        rstudioapi::isAvailable()) {
    script_path <- rstudioapi::getActiveDocumentContext()$path
  }
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  } else {
    message(
      "Script location unavailable; keeping working directory: ", getwd()
    )
  }
})
## -------

## Load packages
# Every package listed here is attached; any that is not installed yet is
# installed first. The order of the vector is the attach order (plyr is
# attached before dplyr).
listofpackages <- c(
  "ellipse", "reshape2", "ggplot2", "dygraphs",
  "plyr", "dplyr", "forecast", "aod"
)
for (pkg in listofpackages) {
  # requireNamespace() only checks availability: it neither attaches the
  # package nor warns when it is missing, unlike require().
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # library() stops with an error if the package still cannot be loaded,
  # so a failed installation is reported here rather than further down.
  library(pkg, character.only = TRUE)
}
## Import the CSV with the readr package
library(readr)
data <- read_csv(file = "olympics_100m1.csv")

head(x = data)
typeof(x = data)  # storage type of the imported object
# Drop every row containing a missing value, then keep only the rows whose
# Year is greater than 1924.
data <- subset(x = na.omit(object = data), subset = Year > 1924)

#------------------------
# QUESTION 2 
#------------------------
# Number of rows for every Country/Gender combination, reported in a column
# named Wins (length() is applied to the Gender values within each group).
count.wins <- aggregate(
  x = list(Wins = data[["Gender"]]),
  by = list(Country = data[["Country"]], Gender = data[["Gender"]]),
  FUN = length
)
count.wins
#------------------------
# QUESTION 3 
#------------------------
# 
# Scatter plot of Time against Year: points coloured by Gender, each point
# labelled with its Country, plus one default smoother per gender.
#
# The fixed smoother colours agree with the manual point scale applied below.
# That scale hands its unnamed values to the factor levels in order, i.e.
# "0" -> red and "1" -> blue, so the Gender == 0 smoother is red and the
# Gender == 1 smoother is blue (previously they were swapped relative to the
# points). NOTE(review): assumes Gender is coded 0/1 -- confirm.
p <- ggplot(data, aes(Year, Time)) +
  geom_point(aes(color = factor(Gender)), size = 3) +
  geom_text(
    aes(label = Country),
    hjust = 0, vjust = 0, color = "black", size = 3
  ) +
  geom_smooth(data = subset(data, Gender == 1), aes(Year, Time), color = "blue") +
  geom_smooth(data = subset(data, Gender == 0), aes(Year, Time), color = "red")

# Same plot with the point colours fixed to red/blue
p + scale_color_manual(values = c("red", "blue"))
# Same plot with an additional single linear fit over all rows, in green
p + geom_smooth(method = lm, colour = "green")

#------------------------
# QUESTION 4 
#------------------------
# Pooled model: Time regressed on Year using all rows; its fitted values
# are stored next to the observations.
model_pool <- lm(formula = Time ~ Year, data = data)
summary(model_pool)
data[["fitted_pool"]] <- fitted(model_pool)

# Same regression with Gender added as a second regressor; fitted values
# are stored as well.
model_panel <- lm(formula = Time ~ Year + Gender, data = data)
summary(model_panel)
data[["fitted_panel"]] <- fitted(model_panel)

# Separate regression of Time on Year for the rows with Gender == 1
model_reg_m <- lm(formula = Time ~ Year, data = subset(data, Gender == 1))
summary(model_reg_m)

# Separate regression of Time on Year for the rows with Gender == 0
model_reg_w <- lm(formula = Time ~ Year, data = subset(data, Gender == 0))
summary(model_reg_w)


#------------------------
# QUESTION 5
#------------------------

###graphs with ggplot 
# The last layer of p1 plots a `fitted_2m` column that was never created, so
# rendering the plot stopped with "object 'fitted_2m' not found". Build it
# here from the two gender-specific regressions: each row gets the prediction
# of the model estimated on its own gender (NA if Gender is neither 0 nor 1).
# NOTE(review): presumably "2m" means "two models" -- confirm this is the
# series the orange points were meant to show.
data$fitted_2m <- ifelse(
  data$Gender == 1,
  predict(model_reg_m, newdata = data),
  ifelse(data$Gender == 0, predict(model_reg_w, newdata = data), NA_real_)
)

# Observed times (coloured by Gender, labelled with Country) overlaid with
# the fitted values of the three specifications:
#   green  - pooled model           (fitted_pool)
#   black  - model with Gender term (fitted_panel)
#   orange - gender-specific models (fitted_2m)
p1 <- ggplot(data, aes(Year, Time)) +
  geom_point(aes(color = factor(Gender)), size = 3) +
  geom_text(
    aes(label = Country),
    hjust = 0, vjust = 0, color = "black", size = 3
  ) +
  scale_color_manual(values = c("red", "blue")) +
  geom_point(aes(Year, fitted_pool), size = 2, colour = "green") +
  geom_point(aes(Year, fitted_panel), size = 2, colour = "black") +
  geom_point(aes(Year, fitted_2m), size = 2, colour = "orange")

p1


#------------------------
# QUESTION 6
#------------------------
# cluster analysis 


