---
title: "Data import and transformation. Shiller database"
output:
  html_document:
    toc: yes
  html_notebook: default
  pdf_document:
    toc: yes
  word_document:
    toc: yes
---
```{r, include=FALSE}
# packages used
listofpackages <- c("dygraphs", "dplyr","ellipse","reshape2","ggplot2")

for (j in listofpackages){
  if(sum(installed.packages()[, 1] == j) == 0) {
    install.packages(j)
  }
  library(j, character.only = T)
}


# additional convenient functions
persum <- function(x, k){
  # works for a unidimensional time series object and normal vectors
  if (is.ts(x) == T){
    # setting the frames of a new series
    frequency <- frequency(x)
    shift <- c((k-1) %/% frequency, (k-1) %% frequency)
    start <- start(x) + shift
    end <- end(x)

    # summing the data
    newdata <- array(0, length(x)-k+1)
    for (i in 1:(length(x)-k+1)){
      newdata[i] <- sum(x[i:(i+k-1)])
    }

    # transforming into time series
    out <- ts(newdata, start = start, frequency = frequency, end = end)
  }

  if (is.vector(x) == T){
    len <- length(x)
    newdata <- array(NA, len)
    for (i in len:k){
      newdata[i] <- sum(x[(i-k+1):i])
    }
    out <- newdata

  }
  
  return(out)
}

persum2 <- function(x, k){
  # works for a unidimensional time series object or a vector
  
  if (is.ts(x) == T){
    # setting the frames of a new series
    frequency <- frequency(x)
    shift <- c((k-1) %/% frequency, (k-1) %% frequency)
    start <- start(x) + shift
    end <- end(x)
  
    # summing the data
    newdata <- array(0, length(x)-k+1)
    for (i in 1:(length(x)-k+1)){
      newdata[i] <- prod(1 + x[i:(i+k-1)]) - 1
    }
  
    # transforming into time series
    out <- ts(newdata, start = start, frequency = frequency, end = end)
    
  }

  if (is.vector(x) == T){
    len <- length(x)
    newdata <- array(NA, len)
    for (i in len:k){
      newdata[i] <- prod(1 + x[(i-k+1):i]) - 1
    }
    out <- newdata 
  }
  
  return(out)
}


```


## Data import and preparation 
### Reading the data 
We begin by importing the data that we have at our disposition.
```{r}
dataie <- read.table(file = "data/ie_data2018_09.csv",header = T,
                   skip = 7, nrows = 1773,
                   stringsAsFactors = F, sep = ";", dec = ".")

dataie <- data.matrix(dataie[1:1773, 1:11]) # this is done to convert everything into numbers. Is a way to ensure that you won't have string characters due to improper reading of the .csv file. In case at least one of the column entries is a character string, all the column will be coerced to the string format. Il morale della favola: construct your datasets properly!
head(dataie) # displays several first entries of the object (vector, matrix, data frame etc.)
dimnames(dataie)[[2]] #displays the names of the columns if such are present
```
### Data-Frames and Time-Series

Having loaded our dataset, we want to convert it to a format that will be useful for the analysis. There are two competing ways to store the data: using data frames or using arrays.
Arrays permit to store multidimensional arrays of data of the *same type*, while data frames store the data in two dimensions (just like a matrix), allowing though for the storage of different types of data.
We will store the useful data in a data frame, as this format provides several benefits regarding subsetting, plotting and function methods.
Instead, the use of arrays will be done for the intermediate computations, as sometimes it is easier to logically allocate data in multidimensional classification.

Another useful feature we will be making use of is the *time-series* data class. It will be useful for convenient plotting and, sometimes, subsetting.

```{r}
tsdata <- ts(dataie, start = c(1871, 1), frequency = 12, names = dimnames(dataie)[[2]]) # creates a time series object
tsdata <- na.omit(tsdata) # omitting the rows with NA presence
s1 <- window(tsdata[, "P"], start = c(1900, 1), end = c(1950, 12)) # creating a subset of a time series. With time series objects subsetting works in a slightly different way, so be careful
#We can now plot 
plot(s1, ylab = "Price", main = "S&P500 price", col = "blue", lwd = 2) # a hint of how it looks
dygraph(s1, ylab = "Price", main = "S&P500 price")
```
Having constructed a time-series object I can store it as a data-frame. A format useful for transformations 
```{r}
data01 <- as.data.frame(tsdata) # be careful as the data do not possess the time series class any more.
# to have a proper time index, we add another column to the data frame
data01$date <- seq(from = tsp(tsdata)[1], to = tsp(tsdata)[2], length.out = nrow(tsdata))
# now I subset the dataframe to build graphs 
t0 <- which(data01$date == 1900)
t1 <- which(data01$date == 1950)
#We can now plot, please note the difference with plotting from a time-series object

plot(data01$date[t0:t1],data01$P[t0:t1],ylab = "Price", main = "S&P500 price", col = "blue", lwd = 2,type="l")

datagraph<- subset(data01,date > 1900 & date < 1951,select=c(date,P))
dygraph(datagraph, ylab = "Price", main = "S&P500 price")

```

##Data transformations
All the essential data transformations, such as taking logarithms, performing linear transformations or taking the lagged values is performed in an easy way for any arrays[^1] and time series in particular.

[^1]:An array is a multidimensional data structure, whose entries are of the same data type. For reference, see the [data structure typology](http://adv-r.had.co.nz/Data-structures.html)

Now we will implement transformation and appending new variables into the data frame data01.
```{r}

data01$b10Y_ytm <- data01$GS10/100
data01$lp <- log(data01$P)
data01$divm <- data01$D/12
data01$ld <- log(data01$divm)
data01$dp <- data01$divm/data01$P
data01$ldp <- log(data01$dp)
dp_bar <- mean(data01$dp)
rho <- (1/dp_bar)/(1 + 1/dp_bar)
k <- -log(1-rho) - rho*log(1/(1-rho)-1)
```
We are now ready to compute linear returns, log returns and log-linearized total returns. To this end please remember the following formula from the lecture notes: 

$$ r_{t+1}^{s} = \kappa + \rho (p_{t+1}-d_{t+1}) + \Delta d_{t+1} - (p_{t}-d_{t})$$

```{r}
# log monthly stock returns
data01$ret_m_lw <- c(NA,diff(data01$lp))
data01$ret_m_lc <- log((data01$P+data01$divm)/(lag(data01$P, 1)))
data01$ret_m_lc_a <- data01$ret_m_lc*12
data01$ret_m_ll <- k - rho*data01$ldp+(data01$ld - lag(data01$ld, 1)) + lag(data01$ldp, 1)

# linear monthly stock returns
data01$ret_m_w <- data01$P/lag(data01$P, 1)-1
data01$ret_m_c <- (data01$P+data01$divm)/lag(data01$P, 1) - 1

# inflation
data01$infl_m <- (data01$CPI - lag(data01$CPI, 1))/lag(data01$CPI, 1)
data01$infl_a <- (data01$CPI - lag(data01$CPI, 12))/lag(data01$CPI, 12)
data01$infl_10y <- (data01$CPI - lag(data01$CPI, 120))/lag(data01$CPI, 120)

# real returns
data01$ret_m_wr <- (1+data01$ret_m_w)/(1+data01$infl_m) - 1
data01$ret_m_cr <- (1+data01$ret_m_c)/(1+data01$infl_m) - 1

# log monthly returns on long term bonds
data01$b10Y_ytm_ll <- log(1+data01$b10Y_ytm/12)
data01$rho_b <- 1/(1+data01$b10Y_ytm)
data01$dur <- 12*(1 - data01$rho_b^10)/(1 - data01$rho_b)
data01$ret_b10Y_m_ll <- lag(data01$dur, 1) * lag(data01$b10Y_ytm_ll, 1) - (lag(data01$dur, 1) - 1) * data01$b10Y_ytm_ll

# returns at different frequencies
## returns in logarithms
data01$ret_1y_lc <- persum(data01$ret_m_lc, 12)  
data01$ret_10y_lc <- persum(data01$ret_m_lc, 120) 
data01$ret_10y_lca <- data01$ret_10y_lc*0.1 

## exact returns
data01$ret_1y_c <- (1+data01$ret_m_c) * (1+lag(data01$ret_m_c, 1)) * (1+lag(data01$ret_m_c,2)) * (1+lag(data01$ret_m_c, 3)) * (1+lag(data01$ret_m_c, 4)) * (1+lag(data01$ret_m_c, 5)) * (1+lag(data01$ret_m_c, 6)) * (1+lag(data01$ret_m_c, 7)) * (1+lag(data01$ret_m_c, 8)) * (1+lag(data01$ret_m_c, 9)) * (1+lag(data01$ret_m_c, 10)) * (1+lag(data01$ret_m_c, 11)) - 1
data01$ret_1y_cr <- (1 + data01$ret_1y_c)/(1 + data01$infl_a) - 1

data01$ret_10y_c <- (1+data01$ret_1y_c) * (1+lag(data01$ret_1y_c, 12)) * (1+lag(data01$ret_1y_c,24)) * (1+lag(data01$ret_1y_c, 36)) * (1+lag(data01$ret_1y_c, 48)) * (1+lag(data01$ret_1y_c, 60)) * (1+lag(data01$ret_1y_c, 72)) * (1+lag(data01$ret_1y_c, 84)) * (1+lag(data01$ret_1y_c, 96)) * (1+lag(data01$ret_1y_c, 108)) - 1
data01$ret_10y_cr <- (1 + data01$ret_10y_c)/(1 + data01$infl_10y) - 1
data01$ret_10y_ca <- (1 + data01$ret_10y_c)^0.1 - 1

```
##Descriptive Data Analysis

We start from descriptive statistics 
```{r}

# select the relevant subset fot the analysis 
datashow<- subset(data01,date >1900 & date < 1951,select=c(ret_m_w,ret_m_c,infl_m,ldp))

summary(datashow) # this is very useful to get a grip on the data structure
mean(datashow[,"ret_m_w"])
sd(datashow[,"ret_m_w"])
var(datashow[,"ret_m_w"])
cor(datashow)
cor.datacor = cor(datashow, use="complete.obs")
cor.datacor

## ------------------------------------------------------------------------
ord <- order(cor.datacor[1,])
ordered.cor.datacor <- cor.datacor[ord, ord]
plotcorr(ordered.cor.datacor, col=cm.colors(11)[5*ordered.cor.datacor + 6])


## ------------------------------------------------------------------------
cormat <- round(cor(datashow),2)
head(cormat)
melted_cormat <- melt(cormat)
head(melted_cormat)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile()
# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
  cormat[upper.tri(cormat)] <- NA
  return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
  cormat[lower.tri(cormat)]<- NA
  return(cormat)
}
upper_tri <- get_upper_tri(cormat)
upper_tri
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Heatmap
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()

```

##Graphical Data Analysis

We move now to some graphical data analysis  

### Time-series plot of returns

```{r}
t0 <- which(data01$date == 1900)
t1 <- which(data01$date == 1950)

plot(data01$date[t0:t1],data01$ret_m_w[t0:t1], col = 'blue', type = "l", 
     ylab = "return without div", xlab = "date")
lines(y = rep(mean(data01$ret_m_w[t0:t1], na.rm = T), length(data01$ret_m_w[t0:t1])), x = data01$date[t0:t1], col = "red")
lines(y = data01$ret_m_c[t0:t1], x = data01$date[t0:t1], col = "green")
```

### time-series plot of Cumulative Returns

```{r}
t0 <- which(data01$date == 1950)
t1 <- which(data01$date == 2018)
# buy and hold returns
## what would happen had we invested $1 in the S&P500 in the beginning of 1960
## initializing values
data01$w <- data01$c <- data01$infl <- data01$wr <- data01$cr <- array(data = NA, dim = nrow(data01))

data01[t0, c("w", "c", "infl", "wr", "cr")] <- 1 # don't need to repeat the value to make the array being assigned be of the same length. be careful though as it is one of the few cases of exception
for (i in (t0+1):(t1-1)) {
  data01$w[i] <- data01$w[i-1] * (1 + data01$ret_m_w[i])
  data01$c[i] <- data01$c[i-1] * (1 + data01$ret_m_c[i])
  data01$wr[i] <- data01$wr[i-1] * (1 + data01$ret_m_wr[i])
  data01$cr[i] <- data01$cr[i-1] * (1 + data01$ret_m_cr[i])
  data01$infl[i] <- data01$infl[i-1] * (1 + data01$infl_m[i])
}
plot(data01$date[t0:t1],data01$w[t0:t1],
     type = "l", col = "red", ylim = c(0, 2000),
     ylab = "cumulative return no div", xlab="year")
lines(y =data01$c[t0:t1] , x = data01$date[t0:t1], col = "blue",type = "l", 
     ylab = "return cum div")
lines(y =data01$infl[t0:t1] , x = data01$date[t0:t1], col = "green",type = "l", 
     ylab = "return cum div")
datagraph<- subset(data01,date > 1950 & date < 2018.1,select=c(date,w,c,infl))
dygraph(datagraph, main = "Cumulative Returns")

```

### More  plots

```{r}
# some standard2 plots
# reshaping the series first
plotdata <- ts(data01, start = c(1881, 1), frequency = 12)
start <- c(1960, 1)
end <- c(2015, 12)
s1 <- window(plotdata[, "ret_m_cr"], start = start, end = end)
s2 <- window(plotdata[, "ret_1y_cr"], start = start, end = end)
s3 <- window(plotdata[, "ret_10y_cr"], start = start, end = end)


plot(s1, type = "l", main = "US 1-month stock market returns", ylab = "return") # plot of the series plus the confidence bands.
Z <- array(1, dim = c(length(s1), 1))
lines(y = rep(mean(s1), length(s1)), x = seq(tsp(s1)[1], tsp(s1)[2], length.out = length(s1)), col = "red")
lines(y = rep(mean(s1) + 2*sd(s1), length(s1)), x = seq(tsp(s1)[1], tsp(s1)[2], length.out = length(s1)), col = "red", lty = 2)
lines(y = rep(mean(s1) - 2*sd(s1), length(s1)), x = seq(tsp(s1)[1], tsp(s1)[2], length.out = length(s1)), col = "red", lty = 2)

plot(s2, type = "l", main = "US 1-year stock market returns", ylab = "return") # plot of the series plus the confidence bands.
Z <- array(1, dim = c(length(s2), 1))
lines(y = rep(mean(s2), length(s2)), x = seq(tsp(s2)[1], tsp(s2)[2], length.out = length(s2)), col = "red")
lines(y = rep(mean(s2) + 2*sd(s2), length(s2)), x = seq(tsp(s2)[1], tsp(s2)[2], length.out = length(s2)), col = "red", lty = 2)
lines(y = rep(mean(s2) - 2*sd(s2), length(s2)), x = seq(tsp(s2)[1], tsp(s2)[2], length.out = length(s2)), col = "red", lty = 2)

plot(s3, type = "l", main = "US 10-year stock market returns", ylab = "return") # plot of the series plus the confidence bands.
Z <- array(1, dim = c(length(s3), 1))
lines(y = rep(mean(s3), length(s3)), x = seq(tsp(s3)[1], tsp(s3)[2], length.out = length(s3)), col = "red")
lines(y = rep(mean(s3) + 2*sd(s3), length(s3)), x = seq(tsp(s3)[1], tsp(s3)[2], length.out = length(s3)), col = "red", lty = 2)
lines(y = rep(mean(s3) - 2*sd(s3), length(s3)), x = seq(tsp(s3)[1], tsp(s3)[2], length.out = length(s3)), col = "red", lty = 2)

plot(s1^2, type = "l", col = "blue", main = "US squared stock market return", ylab = "squared return")

plot(data01$c[949:1626], col = "red", type = "l", main = "comparative performance", ylab = "cumulative impact")
lines(data01$w[949:1626], col = "blue")
lines(data01$infl[949:1626], col = "green")
legend("topleft", legend = c("SM with reinvestment", "SM without reinvestment", "inflation"), col = c("red", "blue", "green"), lty = 1)

ldp1 <- window(plotdata[, "ldp"], start = c(1959, 12), end = c(2015, 11))
cape1 <- window(plotdata[, "CAPE"], start = c(1959, 12), end = c(2015, 11))
lm1 <- lm(s1 ~ ldp1 + cape1)
plot(s1, type = "l", main = "returns vs fundamentals. 1-month", ylab = "return", lty = 2) # plot of the series agains the predictable part
Z <- array(1, dim = c(length(s1), 1))
lines(y = rep(mean(s1), length(s1)), x = seq(tsp(s1)[1], tsp(s1)[2], length.out = length(s1)), col = "red")
lines(y = lm1$fitted.values, x = seq(tsp(s1)[1], tsp(s1)[2], length.out = length(s1)), col = "blue", lwd = 2)
legend("bottomleft", legend = c("1-month return", "mean", "fundamentals"), col = c("black", "red", "blue"), lty = c(2, 1, 1), lwd = c(1, 1, 2))

ldp2 <- window(plotdata[, "ldp"], start = c(1959, 1), end = c(2014, 12))
cape2 <- window(plotdata[, "CAPE"], start = c(1959, 1), end = c(2014, 12))
lm2 <- lm(s2 ~ ldp2 + cape2)
plot(s2, type = "l", main = "returns vs fundamentals. 1-year", ylab = "return", lty = 2) # plot of the series agains the predictable part
Z <- array(1, dim = c(length(s2), 1))
lines(y = rep(mean(s2), length(s2)), x = seq(tsp(s2)[1], tsp(s2)[2], length.out = length(s2)), col = "red")
lines(y = lm2$fitted.values, x = seq(tsp(s2)[1], tsp(s2)[2], length.out = length(s2)), col = "blue", lwd = 2)
legend("bottomleft", legend = c("1-year return", "mean", "fundamentals"), col = c("black", "red", "blue"), lty = c(2, 1, 1), lwd = c(1, 1, 2))

ldp3 <- window(plotdata[, "ldp"], start = c(1950, 1), end = c(2005, 12))
cape3 <- window(plotdata[, "CAPE"], start = c(1950, 1), end = c(2005, 12))
lm3 <- lm(s3 ~ ldp3 + cape3)
plot(s3, type = "l", main = "returns vs fundamentals. 10-years", ylab = "return", lty = 1) # plot of the series agains the predictable part
Z <- array(1, dim = c(length(s3), 1))
lines(y = rep(mean(s3), length(s3)), x = seq(tsp(s3)[1], tsp(s3)[2], length.out = length(s3)), col = "red")
lines(y = lm3$fitted.values, x = seq(tsp(s3)[1], tsp(s3)[2], length.out = length(s3)), col = "blue", lwd = 2)
legend("bottomleft", legend = c("10-year return", "mean", "fundamentals"), col = c("black", "red", "blue"), lty = c(2, 1, 1), lwd = c(1, 1, 2))


# one can also combine  several plots on one canvas
par(mfrow = c(2, 2))
start <- c(1960, 1)
end <- c(2015, 12)
plot(window(plotdata[, "ret_1y_c"], start, end), type = "l", ylab = "1-year return")
plot(window(plotdata[, "ret_1y_c"], start, end), window(plotdata[, "ret_10y_c"], start, end),
     ylab = "10-year return", xlab = "1-year return")
abline(reg = lm(window(plotdata[, "ret_10y_c"], start, end) ~ window(plotdata[, "ret_1y_c"], start, end)), col  = "red")
plot(y = window(plotdata[, "ret_1y_c"], start, end), x = window(plotdata[, "ret_10y_c"], start, end),
     ylab = "1-year return", xlab = "10-year return")
abline(reg = lm(window(plotdata[, "ret_1y_c"], start, end) ~ window(plotdata[, "ret_10y_c"], start, end)), col  = "red")
plot(window(plotdata[, "ret_10y_c"], start, end), type = "l", ylab = "10-year return")
par(mfrow = c(1, 1))

plot(window(plotdata[, "ret_10y_ca"], start = c(1900, 1), end = c(2015, 12)),
     type = "l", col = "red", ylim = c(-0.05, 0.25),
     ylab = "return")
lines(window(plotdata[, "b10Y_ytm"], start = c(1890, 1), end = c(2005, 12)), 
      x = seq(from = 1900, to = 2016, length.out = 116*12), col = "blue")
lines(y = rep(0, 1000), x = seq(1800, 2020, length.out = 1000))
legend("topleft", legend = c("stocks cumulative dividend", "10Y bonds"),
       col = c("red", "blue"), lty = 1)


```

### Histograms

```{r}

hist(s1, breaks = seq(min(s1), max(s1), l = 20+1),prob=TRUE, main = "histogram of monthly returns")
curve(dnorm(x,mean=mean(s1),sd=sd(s1)),col='darkblue',lwd=2,add=TRUE)
```

### Q-Q plots

```{r}
start <- c(1948, 2)
end <- c(2014, 6)
qqplot(window(plotdata[, "ret_m_c"], start = start, end = end),
       window(plotdata[, "ret_m_lc"], start = start, end = end),
       ylim = c(-0.15,0.15), xlim = c(-0.15,0.15),
       ylab = "monthly return. log approximation",
       xlab = "monthly return. exact computation",
       main = "Quantile-Quantile plot (Q-Q plot)")
mod5 <- lm(window(plotdata[, "ret_m_lc"], start = start, end = end) ~ -1 + window(plotdata[, "ret_m_c"], start = start, end = end))
abline(reg = mod5, col = "red")
# qq-plot versus normal dist 
qqnorm(window(plotdata[, "ret_m_c"], start = start, end = end),
       ylim = c(-0.15,0.15),ylab = "monthly return. sample quantile",
       xlab = "monthly return. theoretical quantiles",
       main = "Normal (Q-Q plot)")
qqline(window(plotdata[, "ret_m_c"], start = start, end = end), datax = FALSE, distribution = qnorm,
       probs = c(0.25, 0.75), qtype = 7)



```


## Saving the dataset with all the transformations!
```{r}
write.csv(x = data01, file = "data/data01.csv")

```

## Matrix operations
R also permits to perform various matrix algebra operations. In R, a matrix is defined as an array which has precisely two dimensions.
```{r}
ones <- array(1, dim = c(24, 1)) # creates a matrix of dimension 24 X 1, where all the entries are equal to 1
is.matrix(ones) # in case things don't work, you can always assure yourself whether you are working with a matrix or not
x1 <- as.matrix(subset(data01, select = ret_m_lc, date >= 2014.5-0.0001 & date <= 2016.5)) # extracting a part of the series
x2 <- as.matrix(subset(data01, select = ret_m_lc_a, date >= 2014.5-0.0001 & date <= 2016.5)) # to be sure of the proper formatting, we explicitly tell R to regard the sample as a matrix. (try without it and see that is does not work)
x <- cbind(x1, x2) # here we give the necessary conformable shaape to our matrix
X <- cbind(ones, x) # be always aware that R is case-sensitive
XX <- t(X) %*% X # attention to the special way to use the matrix multiplication operator.
try(solve(XX)) # multicollinrearity at play !!
qr(XX)$rank # check the rank of the matrix

# There are several approaches to get the statistics you need
solve(t(ones) %*% ones) %*% t(ones) %*% x # one way to find the means ( which are also the the projections on the space spanned by the vector of constants )
colMeans(x) # computes means by columns
c(mean(x1), mean(x2)) # computes the means separately and then joins them
apply(X = x, MARGIN = 2, mean) # an efficient command that applies different functions to the specified array dimensions

sigma_ret <- sd(x1)
sigma_ret2 <- sqrt(XX[2, 2]/XX[1, 1] - colMeans(x)[1]^2) # why do the two numbers differ?? (hint: degrees of freedom)
sigma_reta2 <- sqrt(XX[3, 3]/XX[1, 1] - colMeans(x)[2]^2)
cov_rra <- XX[3, 2]/24 - colMeans(x)[1]*colMeans(x)[2]
cor_rra <- cov_rra/(sigma_ret2 * sigma_reta2)
cor_rra

# obtaining the covariance matrix
Xvcov <- XX/XX[1 ,1] - colMeans(X) %*% t(colMeans(X))
Xvcov
cov(X) # again, the difference is just due to the degrees of freedom correction

```