## ----setup, include=FALSE----------------------------------------------------
# Show the source code of every chunk in the knitted output.
knitr::opts_chunk$set(echo = TRUE)


## ----------------------------------------------------------------------------
# Clear the global environment so objects left over from an earlier session
# cannot interfere with the examples below.
rm(list = ls())
## ------------------------------------------------------------------------
# Make the folder of the active document the working directory so that the
# relative data paths used in this script resolve. This is only possible
# inside an interactive RStudio session with a saved document; in any other
# situation the current working directory is kept instead of aborting.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  local({
    doc_path <- rstudioapi::getActiveDocumentContext()$path
    # An unsaved document reports an empty path; setwd("") would fail.
    if (nzchar(doc_path)) {
      setwd(dirname(doc_path))
    }
  })
}

# packages used
listofpackages <- c(
  "dygraphs", "dplyr", "ellipse", "reshape2", "ggplot2", "highcharter",
  "xts", "xlsx", "readxl", "quantmod", "foreign"
)

# Install each package that is missing, then attach it.
for (j in listofpackages) {
  # system.file() returns "" when the package is not installed; this avoids
  # re-scanning the whole library with installed.packages() on every pass.
  if (!nzchar(system.file(package = j))) {
    install.packages(j)
  }
  library(j, character.only = TRUE)
}



## ----------------------------------------------------------------------------

# Show read.table()'s arguments, then import the tab-separated demo file,
# using its first line as the column names.
args(read.table)
data_readtable1 <- read.table(file = "demo_data.txt", header = TRUE, sep = "\t")
head(data_readtable1)
str(data_readtable1)


## ----------------------------------------------------------------------------
# Show read.csv()'s arguments, then import the comma-separated demo file
# with the function's defaults and inspect the result.
args(read.csv)
data_readcsv <- read.csv(file = "demo_data.csv")
head(data_readcsv)
str(data_readcsv)


## ----------------------------------------------------------------------------
# Load the built-in iris data set and look at its first rows and structure.
data("iris")
head(x = iris)
str(object = iris)


## ----------------------------------------------------------------------------
# SPSS: as called here, read.spss() returns the data as a list, so after a
# first look at the raw result it is converted into a data frame.
data_SPSS <- read.spss(file = "personnel.sav")
head(data_SPSS)
data_SPSS <- as.data.frame(data_SPSS)
head(data_SPSS)

# Stata
data_Stata <- read.dta(file = "Income.dta")
head(data_Stata)


## ----------------------------------------------------------------------------
# Import the monthly stock workbook, name its first column "Date" and
# inspect the storage types of the table and of a few columns.
raw_data <- read_xlsx("2023_monthly_stocks.xlsx")
names(raw_data)[1] <- "Date"
typeof(raw_data)
typeof(raw_data$Date)
typeof(raw_data$AXP)
typeof(raw_data$CSCO)
str(raw_data)

# One month-start date per workbook row, beginning February 1985. The length
# follows the data instead of a fixed 462, so xts() below does not fail when
# the workbook holds a different number of rows.
# NOTE(review): assumes the first row is February 1985 and that there are no
# gaps between months -- confirm against the workbook.
dates <- seq(as.Date("1985-02-01"), length.out = nrow(raw_data), by = "months")

# Data-frame version: selected tickers plus the Date column, complete rows only.
params <- c("Date", "AXP", "AMGN", "AAPL", "BA", "CAT", "CSCO", "DJI")
data <- raw_data[, params]
data <- na.omit(data)
data <- data %>%
  mutate(Date = as.Date(Date, format = "%Y-%m-%d"))

# Time-series version: the same tickers indexed by the generated dates.
params1 <- c("AXP", "AMGN", "AAPL", "BA", "CAT", "CSCO", "DJI")
tsdata <- xts(raw_data[, params1], order.by = dates) # creates a time series object
tsdata <- na.omit(tsdata) # omitting the rows with NA presence
# Second pass: drops any row whose Date became NA in the conversion above.
data <- na.omit(data)
## having created the database with all observations, a subset can be generated
#tsdata1 <- tsdata["1992-02-01/1993-02-01"]
#data=subset(data,select=c(1:12))


## ----------------------------------------------------------------------------
# Import the US stock prices and list the available columns.
data_stocks <- read.csv(file = "us_stocks.csv", header = TRUE)
head(data_stocks)
names(data_stocks)
colnames(data_stocks)

# `$` extracts a single column as a vector
msft_prices1 <- data_stocks$MSFT
head(msft_prices1)
# `[[` with the column name also gives a vector
msft_prices2 <- data_stocks[["MSFT"]]
head(msft_prices2)
# `[` with the column name keeps the data-frame structure; it also accepts
# several column names at once
msft_prices3 <- data_stocks["MSFT"]
head(msft_prices3)


## ------------------------------------------------------------------------
# Positional indexing: an empty row index selects every row, so this pulls
# the whole second column (MSFT in this file).
msft_prices4 <- data_stocks[, 2]

head(msft_prices4)
# An empty column index selects every column: the complete fourth row.
data_stocks[4, ]

## ------------------------------------------------------------------------
# Percentage log returns for MSFT, stored as a plain vector.
msft_ret <- diff(log(data_stocks$MSFT)) * 100
# Binding this vector to the data straight away fails, because diff()
# returns one element fewer than the number of rows:
#data_stocks_r=cbind(data_stocks,MSFT_RET=msft_ret)
length(msft_ret)
length(data_stocks$MSFT)
# Pad with a leading 0 as a placeholder for the first observation.
msft_ret <- c(0, msft_ret)
# The lengths now agree ...
length(msft_ret)
# ... so the returns can be attached as an extra column.
data_stocks_r <- cbind(data_stocks, MSFT_RET = msft_ret)
head(data_stocks_r) # shows one more column added to the data

## ------------------------------------------------------------------------
# Build two data frames from data_stocks and stack them row-wise.
# head()/tail() take the first and last 10 rows whatever the size of the
# file, instead of relying on the fixed row numbers 2775:2784; the original
# row names are kept, so the printed result shows where each row came from.
data_r1 <- head(data_stocks, n = 10) # first 10 rows
data_r2 <- tail(data_stocks, n = 10) # last 10 rows
data_stocks_rbind <- rbind(data_r1, data_r2)
print(data_stocks_rbind)

## ------------------------------------------------------------------------
# Keep the rows in which the Apple price exceeds 100.
data_aaplgr100 <- data_stocks[data_stocks$AAPL > 100, ]
head(data_aaplgr100)
# Sanity check: the smallest remaining price should be above 100.
# A result of NA here means the filtered data frame still contains NAs.
min(data_aaplgr100$AAPL)
# na.omit() drops every row that has an NA in any column.
data_aaplgr100 <- na.omit(data_aaplgr100)
# Repeat the check on the cleaned data.
min(data_aaplgr100$AAPL)

## ------------------------------------------------------------------------
head(data_stocks) # notice NAs in GOOG
# Drop only the rows in which GOOG is missing.
data_stocks_googlena <- data_stocks[!is.na(data_stocks$GOOG), ]
head(data_stocks_googlena) # after removing NAs
# Other columns may still hold NAs after that step;
# na.omit() removes every row with a missing value in any column.
data_stocks_naomit <- na.omit(data_stocks)

## ------------------------------------------------------------------------
# Rows of the NA-free data whose MSFT price is above 20 and at most 30.
data_msft <- data_stocks_naomit[
  data_stocks_naomit$MSFT > 20 & data_stocks_naomit$MSFT <= 30,
]
min(data_msft$MSFT) # check

## ------------------------------------------------------------------------
# The same kind of row filter written with subset(): show the data-frame
# method's arguments, then keep the rows with an Apple price above 100.
args(subset.data.frame)
aaplgr100 <- subset(data_stocks_naomit, subset = AAPL > 100)
head(aaplgr100)
min(aaplgr100$AAPL)



## ----------------------------------------------------------------------------
# Store the imported table in R's own binary format ...
save(data_readtable1, file = "data1.Rdata")
# ... and also write it out as a plain text file.
write.table(x = data_readtable1, file = "data1.txt")
# load() restores the saved object under its original name.
load(file = "data1.Rdata")
head(data_readtable1)