rm(list=ls())
#Check the working directory before importing else provide full path
#setwd(path)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# packages used
listofpackages <- c("dygraphs", "tidyverse","ellipse","reshape2","highcharter","xts","xlsx","readxl","quantmod")

for (j in listofpackages){
  if(sum(installed.packages()[, 1] == j) == 0) {
    install.packages(j)
  }
  library(j, character.only = T)
}

raw_data           = read_xlsx("../data/2023_monthly_stocks.xlsx") 
names(raw_data)[1] = 'Date'
typeof(raw_data)
typeof(raw_data$Date)
typeof(raw_data$AXP)
typeof(raw_data$CSCO)

dates <-seq(as.Date("1985-02-01"),length=462, by="months")
params <- c("Date","AXP","AMGN","AAPL","BA","CAT","CSCO", "DJI")
data <- raw_data[, c(params)]
data<- na.omit(data)
data <- data %>% 
  mutate(Date = as.Date(Date, format = "%Y-%m-%d"))

params1 <- c("AXP","AMGN","AAPL","BA","CAT","CSCO", "DJI")
tsdata <- xts(raw_data[, c(params1)], order.by=dates) # creates a time series object
tsdata <- na.omit(tsdata) # omitting the rows with NA presence
data<- na.omit(data)
## having created the database with all observation we generate a subset 
#tsdata1 <- tsdata["1992-02-01/1993-02-01"]
#data=subset(data,select=c(1:12))

## --------------------
# DATA TRANSFORMATIONS 
## --------------------
#1. from prices to returns
# exact monthly returns 
t1<-nrow(data)
data$AXP_ret <- data$AMGN_ret <- array(data = NA, dim = t1)
for (i in 2:t1) {
  data[i, "AMGN_ret"][[1]]=(data[i, "AMGN"][[1]]-data[i-1, "AMGN"][[1]])/data[i-1, "AMGN"][[1]]
  data[i, "AXP_ret"][[1]]=(data[i, "AXP"][[1]]-data[i-1, "AXP"][[1]])/data[i-1, "AXP"][[1]]
}

# the following lines of R apply the same transfromation to 
# two series AXP and AMGN available in .xts format in a frame called tsdata. 
# Could you do the same transformation in a more parsimonious way  by having
# a loop over the serie names  AXP and AMGN ?

series_names <- c("AAPL","BA","CAT","CSCO","DJI")

for (name in series_names) {
  return_col_name <- paste0(name, "_ret")
  data[, return_col_name] <- array(data = NA, dim = t1)
  for (i in 2:nrow(data)) {
    data[i, return_col_name][[1]] <- (data[i, name][[1]] - data[i - 1, name][[1]]) / data[i - 1, name][[1]]
  } 
}

# same in .xts 
t1<-nrow(tsdata)

tsdata$AXP_ret <- tsdata$AMGN_ret<- tsdata$AAPL_ret<- tsdata$BA_ret<- array(data = NA, dim = t1)
tsdata$CAT_ret<-tsdata$CSCO_ret <- tsdata$DJI_ret<- array(data = NA, dim = t1)
for (i in 2:t1) {
  tsdata[i, "AMGN_ret"][[1]]=(tsdata[i, "AMGN"][[1]]-tsdata[i-1, "AMGN"][[1]])/data[i-1, "AMGN"][[1]]
  tsdata[i, "AXP_ret"][[1]]=(tsdata[i, "AXP"][[1]]-tsdata[i-1, "AXP"][[1]])/data[i-1, "AXP"][[1]]
  #   tsdata[i, "AAPL_ret"][[1]]=(tsdata[i, "AAPL"][[1]]-tsdata[i-1, "AAPL"][[1]])/data[i-1, "AAPL"][[1]]
  #   tsdata[i, "BA_ret"][[1]]=(tsdata[i, "BA"][[1]]-tsdata[i-1, "BA"][[1]])/data[i-1, "BA"][[1]]
  #   tsdata[i, "CAT_ret"][[1]]=(tsdata[i, "CAT"][[1]]-tsdata[i-1, "CAT"][[1]])/data[i-1, "CAT"][[1]]
  #   tsdata[i, "CSCO_ret"][[1]]=(tsdata[i, "CSCO"][[1]]-tsdata[i-1, "CSCO"][[1]])/data[i-1, "CSCO"][[1]]
  #   tsdata[i, "DJI_ret"][[1]]=(tsdata[i, "DJI"][[1]]-tsdata[i-1, "DJI"][[1]])/data[i-1, "DJI"][[1]]
}

# the loop is a bit different in .xts  
series_names <- c("AAPL","BA","CAT","CSCO","DJI")

for (name in series_names) {
  return_col_name <- paste0(name, "_ret")
  temporary_column  <- array(data = NA, dim = t1)
  
  tsdata <- merge.xts(tsdata, temporary_column) # add last column
  colnames(tsdata)[ncol(tsdata)] = return_col_name # rename it  
  
  for (i in 2:nrow(data)) {
    tsdata[i, return_col_name] <- (tsdata[i, name][[1]] - tsdata[i - 1, name][[1]]) / tsdata[i - 1, name][[1]]
  }
}

# buy and hold returns
## what would happen had we invested $1 in the DJI and AXP at t0
## initializing values
data$DJI_cum <- data$AXP_cum <- array(data = NA, dim = nrow(data))

data[1, c("DJI_cum", "AXP_cum")] <- 1 
t1<-nrow(data)
for (i in 2:t1) {
  data[i, "DJI_cum"][[1]]=data[i-1, "DJI_cum"][[1]]*(1+data[i, "DJI_ret"][[1]])
  data[i, "AXP_cum"][[1]]=data[i-1, "AXP_cum"][[1]]*(1+data[i, "AXP_ret"][[1]])
}


tsdata$DJI_cum <- array(data = NA, dim = nrow(tsdata))
tsdata$AXP_cum <- array(data = NA, dim = nrow(tsdata))
tsdata[1, c("DJI_cum", "AXP_cum")] <- 1 
t1<-nrow(data)
for (i in 2:t1) {
  tsdata[i, "DJI_cum"][[1]]=tsdata[i-1, "DJI_cum"][[1]]*(1+tsdata[i, "DJI_ret"][[1]])
  tsdata[i, "AXP_cum"][[1]]=tsdata[i-1, "AXP_cum"][[1]]*(1+tsdata[i, "AXP_ret"][[1]])
}

## -------------------------
# monthly log stock returns
## -------------------------
data$DJI_lp<-log(data$DJI_cum)
data$AXP_lp<-log(data$AXP_cum)
data$DJI_lret <- c(NA,diff(data$DJI_lp))
data$AXP_lret <- c(NA,diff(data$AXP_lp))
# value of a buy-and-hold portfolio using cumulative log returns 
data$DJI_cuml  <- array(data = NA, dim = nrow(data))
data[1, c("DJI_cuml")]  <- 1 
for (i in 2:t1) {
  data[i, "DJI_cuml"][[1]]=data[i-1, "DJI_cuml"][[1]]*(1+data[i, "DJI_lret"][[1]])
      }

tsdata$DJI_lp<-log(tsdata$DJI_cum)
tsdata$AXP_lp<-log(tsdata$AXP_cum)
tsdata$DJI_lret <- diff(tsdata$DJI_lp)
tsdata$AXP_lret <- diff(tsdata$AXP_lp)
tsdata$DJI_cuml  <- array(data = NA, dim = nrow(tsdata))
tsdata[1, c("DJI_cuml")]  <- 1 
for (i in 2:nrow(tsdata)) {
  tsdata[i, "DJI_cuml"][[1]]=tsdata[i-1, "DJI_cuml"][[1]]*(1+tsdata[i, "DJI_lret"][[1]])
}

tsdata.df <- as.data.frame(tsdata)
save(data, file='data.Rdata')
save(tsdata, file='tsdata.Rdata')
save(tsdata.df, file='tsdata.df.Rdata')

## ---------------------------
# time-series plots 
#-----------------------------
#(1) plot .xts series
plot(tsdata$DJI_ret, col = "blue", lwd = 2, main = "", ylab = "")
lines(tsdata$AXP_ret, col = "green", lwd = 2)
addLegend("topleft",
          legend.names = c("DJI", "AXP"),
          lty = c(1, 1), lwd = c(2, 2),
          col = c("blue", "green"))
dev.copy2pdf(width = 5.72, out.type = "pdf",file="Figure_1xts.pdf")
dev.off()
#(2) use highchart with .xts series
highchart(type = "stock") %>%
  hc_title(text = "Monthly Log Returns") %>%
  hc_add_series(data=tsdata[, "DJI_ret"], name = "DJI_ret")%>%
  hc_add_series(data=tsdata[, "AXP_ret"], name = "AXP_ret")%>%
  hc_add_theme(hc_theme_flat()) %>%
  hc_navigator(enabled = FALSE) %>%
  hc_scrollbar(enabled = FALSE) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_legend(enabled = TRUE)

#(2) use ggplot with the standard dataframe 
plot <- ggplot(data, aes(x = Date)) +
  geom_point(aes(y = DJI_ret, color = "DJI"), size = 2) +
  geom_point(aes(y = AXP_ret, color = "AXP"), size = 2) +
  labs(title = "Returns",
       x = "Time", y = " Value") +
  scale_color_manual(values = c("DJI" = "red", "AXP" = "blue")) +
  theme_minimal() +
  theme(axis.line = element_line(color = "black")) #+

print(plot)
ggsave(filename = "Figure_1.pdf", plot = plot, device = "pdf",width = 5.72,height=3.12)
# dev.copy2pdf(width = 4, out.type = "pdf",file="Figure_1.pdf")
# dev.off()

## ---------------------------
# comparing returns and log-returns 
#-----------------------------

plot(tsdata$DJI_ret, ylab = "Returns", main = "S&P500 ", col = "blue", lwd = 2)
lines(tsdata$DJI_lret, col = "red")

# time-series plot of cumulative returns
plot(tsdata$DJI_cum,
     type = "l", col = "red", ylim = c(0, 15),
     ylab = "cumulative return mkt")
lines(tsdata$DJI_cuml, col = "blue",type = "l",ylab = "cumulative log return mkt")

# cross-plot of exact and log-linearized returns 
plot(x=data$DJI_ret, y=data$DJI_lret, col="red")
lines(x=data$DJI_ret, y=data$DJI_ret,col = "blue")

#cross-plot of returns of AXP and their value predicted from the market 
fm1 <- lm(AXP_ret ~ DJI_ret, data=data)
summary(fm1)
data$AXP_retfit<-c(NA,fitted(fm1))
plot(x=data$DJI_ret, y=data$AXP_ret, col="red")
lines(x=data$DJI_ret, y=data$AXP_retfit,col = "blue")

plotactfit <- ggplot(data, aes(x = DJI_ret, y = AXP_ret)) +
  geom_point(color = "red") +
  geom_line(aes(x = DJI_ret, y = AXP_retfit), color = "blue") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black")   # Adding the zero line
  
  # Display the plot
  print(plotactfit)
 
  
  #--------------------
  #plotting prices
  #--------------------
  sfDJI<- as.numeric(tsdata$DJI[1])
  sfAXP<- as.numeric(tsdata$AXP[1])
  plot(tsdata$DJI/sfDJI,col = "blue",lwd = 2)
  lines(tsdata$AXP/sfAXP, col = "green",lwd = 2)
  addLegend("topleft", on=1, 
            legend.names = c("DJIrs", "AXPrs"), 
            lty=c(1, 1), lwd=c(2, 1),
            col=c("blue", "green", "red"))
  
  # you can interact with Chat GPT to improve on this version of the graphs 
  
  #First Question When I run the following sequence in R I get a graph with tsdata$DJI
  #written at the top left of it. How do I remove this from the graph ?
  #ANSWER 
  
  plot(tsdata$DJI/3267.70, col = "blue", lwd = 2, main = "", ylab = "")
  lines(tsdata$AXP/3.277914, col = "green", lwd = 2)
  addLegend("topleft",
            legend.names = c("DJI", "AXPrs"),
            lty = c(1, 1), lwd = c(2, 2),
            col = c("blue", "green"))  # Remove "red" from col argument
  
  #Second Question> I would like to have the same graph in a double scale
  # with DJI on the left hand scale and AXP on the right hand scale 
  combined_data <- data.frame(DJI = tsdata$DJI, AXP = tsdata$AXP )
  dygraph(combined_data, main = "Double-Scale Time Series Graph") %>%
    dySeries("DJI", label = "DJI", color = "blue") %>%
    dySeries("AXP", label = "AXP", color = "green", axis = "y2") %>%
    dyAxis("y", label = "DJI") %>%
    dyAxis("y2", label = "AXP", independentTicks = TRUE) %>%
    dyLegend(width = 250)
  
  
  
  #--------------------
  #plotting series from a list using GGPLOT 
  #--------------------
  plot <- ggplot(data, aes(x = Date)) +
    geom_line(aes(y = DJI/3267.70, color = "DJI"), size = 2) +
    geom_line(aes(y = AXP/3.277914, color = "AXP"), size = 2) +
    labs(title = "Trends",
         x = "Time", y = " Value") +
    scale_color_manual(values = c("DJI" = "red", "AXP" = "blue")) +
    theme_minimal() +
    theme(axis.line = element_line(color = "black")) +
    scale_x_continuous(breaks = data$Date, labels = data$Date)  # Add this line for x-axis labels
  
  # Print the plot
  print(plot)
  
  #Ask Chat GPT: When I run the following code in R I get "too many " labels on the x axis,
  #how can I reduce the number of labels (say one every 5 years) ?
  #Answer 
  #To reduce the number of x-axis labels in your ggplot, you can use the scale_x_date() function
  #with the date_breaks argument to specify the intervals at which you want the labels to appear.
  #In your case, you want to display labels every 5 years. Here's how you can modify your code to achieve this:
  ggplot(data, aes(x = Date)) +
    geom_line(aes(y = DJI/3267.70, color = "DJI"), size = 1) +
    geom_line(aes(y = AXP/3.277914, color = "AXP"), size = 1) +
    labs(title = "Trends",
         x = "Time", y = " Value") +
    scale_color_manual(values = c("DJI" = "red", "AXP" = "blue")) +
    theme_minimal() +
    theme(axis.line = element_line(color = "black")) +
    scale_x_date(date_breaks = "5 years", date_labels = "%Y")
  #In the code above:
  #scale_x_date() is used to control the x-axis (date) scale.
  #date_breaks = "5 years" specifies that you want to display labels every 5 years.
  #date_labels = "%Y" specifies the date format you want to use for the labels (in this case, the year only).
  #This should result in a plot with x-axis labels appearing every 5 years, making the plot more readable when you have a large time series dataset. Adjust the date_breaks argument as needed to control the spacing of the labels according to your preferences.
  
  
  
  ## ----------------------------------------------------------
  #  combine  several plots on one canvas
  ## ----------------------------------------------------------
  
  par(mfrow = c(2, 2))
  
  plot(tsdata$DJI_ret, ylab = "Returns", main = "DJ30 ", col = "blue", lwd = 2)
  
  plot(x=data$DJI_ret, y=data$DJI_lret, col="red")
  lines(x=data$DJI_ret, y=data$DJI_ret,col = "blue")
  
  plot(x=data$AXP_ret, y=data$AXP_lret, col="red",ylim = c(-0.5, 1))
  lines(x=data$AXP_ret, y=data$AXP_ret,col = "blue")
  
  plot(tsdata$DJI_cum,
       type = "l", col = "red", ylim = c(0, 12),main = "DJ30 ",
       ylab = "cumulative return mkt")
  
  par(mfrow = c(1, 1))
  
  
  ## -------------------------
  #  HISTOGRAMS AND QQ PLOTS 
  ## -------------------------
  
  ## Histograms
  s1 <- na.omit(tsdata$DJI_ret)
  hist(s1, breaks = seq(min(s1), max(s1), l = 20+1),prob=TRUE, main = "histogram of monthly returns")
  curve(dnorm(x,mean=mean(s1),sd=sd(s1)),col='darkblue',lwd=2,add=TRUE)
  
  ## Histograms with Highcharter using .xts data 
  
  hc_hist <- hist(coredata(tsdata$DJI_lret), breaks = 50, plot = FALSE)
  hchart(hc_hist, color = "cornflowerblue")%>%
    hc_title(text =
               paste("DJI",
                     "Log Returns Distribution",
                     sep = " ")) %>%
    hc_add_theme(hc_theme_flat()) %>%
    hc_exporting(enabled = TRUE) %>%
    hc_legend(enabled = FALSE)
  
  hc_hist <- hist(tsdata[, "DJI_lret"], breaks = 50, plot = FALSE)
  hchart(hc_hist, color = "cornflowerblue")%>%
    hc_title(text =
               paste("DJI",
                     "Log Returns Distribution",
                     sep = " ")) %>%
    hc_add_theme(hc_theme_flat()) %>%
    hc_exporting(enabled = TRUE) %>%
    hc_legend(enabled = FALSE)
  ## ------------------------
  
  qqplot(tsdata.df$DJI_ret,
         tsdata.df$DJI_lret,
         ylim = c(-0.15,0.15), xlim = c(-0.15,0.15),
         ylab = "monthly return. log approximation",
         xlab = "monthly return. exact computation",
         main = "Quantile-Quantile plot (Q-Q plot)")
  mod5 <- lm(tsdata.df$DJI_ret ~ tsdata.df$DJI_lret)
  abline(reg = mod5, col = "red")
  
  # qq-plot versus normal dist 
  qqnorm(tsdata$DJI_ret,
         ylim = c(-0.15,0.15),ylab = "monthly return. sample quantile",
         xlab = "monthly return. theoretical quantiles",
         main = "Normal (Q-Q plot)")
  qqline(tsdata$DJI_ret, datax = FALSE, distribution = qnorm,
         probs = c(0.25, 0.75), qtype = 7)
  
  ## ---------------------
  #  CORRELATION ANALYSIS  
  ## ---------------------
  tsdata.df <- as.data.frame(tsdata)
  # Select specific columns and observations from the start date onward
  selected_cols <- c("AMGN_ret", "AXP_ret", "AAPL_ret", "BA_ret", "CAT_ret", "CSCO_ret", "DJI_ret")
  datashow <- subset(tsdata.df[, selected_cols])
  datashow<-na.omit(datashow)
    # Print the resulting subset
  summary(datashow) # this is very useful to get a grip on the data structure
  mean(datashow[,"AMGN_ret"])
  sd(datashow[,"AMGN_ret"])
  var(datashow[,"AMGN_ret"])
  cor(datashow)
  cor.datacor = cor(datashow, use="complete.obs")
  cor.datacor
  
  ## ---------------------
  ord <- order(cor.datacor[1,])
  ordered.cor.datacor <- cor.datacor[ord, ord]
  plotcorr(ordered.cor.datacor, col=cm.colors(11)[5*ordered.cor.datacor + 6])
  
  
  ## ---------------------
  cormat <- round(cor(datashow),2)
  head(cormat)
  melted_cormat <- melt(cormat)
  head(melted_cormat)
  ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
    geom_tile()
  # Get lower triangle of the correlation matrix
  get_lower_tri<-function(cormat){
    cormat[upper.tri(cormat)] <- NA
    return(cormat)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
  }
  upper_tri <- get_upper_tri(cormat)
  upper_tri
  # Melt the correlation matrix
  melted_cormat <- melt(upper_tri, na.rm = TRUE)
  # Heatmap
  ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
    geom_tile(color = "white")+
    scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                         midpoint = 0, limit = c(-1,1), space = "Lab", 
                         name="Pearson\nCorrelation") +
    theme_minimal()+ 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                     size = 12, hjust = 1))+
    coord_fixed()
 