# Start from an empty global environment so objects left over from a previous
# run cannot leak into this one.
# NOTE(review): this also wipes anything else in the user's session - confirm
# that is intended before sourcing this script from another workflow.
rm(list = ls())

# Work from the directory that contains this script. The RStudio API is only
# reachable from inside RStudio and returns an empty path for an unsaved
# document; in those cases keep the current working directory instead of
# aborting the whole script.
#setwd(path)
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  }
}

# packages used
listofpackages <- c("quantmod", "PerformanceAnalytics", "xlsx")

# Install each package only when it is missing, then attach it.
for (j in listofpackages) {
  # system.file() returns "" for a package that is not installed; this avoids
  # rebuilding the full installed.packages() matrix on every iteration.
  if (!nzchar(system.file(package = j))) {
    install.packages(j)
  }
  library(j, character.only = TRUE)
}


# DOWNLOAD DATA FROM YAHOO 

# Yahoo Finance symbols to fetch. The objects used further down (GSPC, DJI,
# IRX) carry the same names without the leading "^".
tickers <- c("^GSPC", "^DJI", "^IRX")


# Pull the monthly price history for every symbol into the global environment.
getSymbols.yahoo(
  tickers,
  env = globalenv(),
  index.class = "Date",
  from = "1990-01-01",
  to = "2019-10-31",
  periodicity = "monthly"
)

# Keep the sixth column of each series, align the three on date and give the
# columns short labels.
stocks <- merge(GSPC[, 6], DJI[, 6], IRX[, 6])
colnames(stocks) <- c("SP500", "DJI", "3m_rate")

# Export the merged table; the dates end up in the row-name column.
write.csv(as.data.frame(stocks), file = "stocks_1019.csv", row.names = TRUE)


# import data from FRED database 
# set tickers
# FRED series identifiers to download; getSymbols.FRED creates one xts object
# per identifier in the global environment.
tickers1 <- c("FEDFUNDS", "DGS10", "GDPPOT")
getSymbols.FRED(tickers1,
                env = globalenv(),
                return.class = "xts",
                index.class = "Date")

# Collapse FEDFUNDS to one observation per quarter (the last one in each
# quarter). xts::last is spelled out because dplyr, attached further down in
# this script, masks `last` when the script is re-run in the same session.
quarterly_fedfunds <- apply.quarterly(FEDFUNDS, xts::last)

# Align all three downloaded series on date. GDPPOT was previously left out of
# the merge, so the head() call below referred to a column that did not exist.
# NOTE(review): the series are merged as downloaded, so dates present in only
# one of them are padded with NA in the others.
macro_fred <- merge(FEDFUNDS[, 1], DGS10[, 1], GDPPOT[, 1])
head(macro_fred$GDPPOT)

#-------------------------------------------------------------------------------
#DOWNLOAD FROM URL 
#-------------------------------------------------------------------------------

# install and load the relevant packages 
# packages used
listofpackages <- c("ellipse", "reshape2", "ggplot2", "dygraphs", "dplyr",
                    "forecast", "aod", "readr", "xlsx")

# Install each package only when it is missing, then attach it.
for (j in listofpackages) {
  # system.file() returns "" for a package that is not installed; this avoids
  # rebuilding the full installed.packages() matrix on every iteration.
  if (!nzchar(system.file(package = j))) {
    install.packages(j)
  }
  library(j, character.only = TRUE)
}


# Region-level COVID-19 CSV hosted in the pcm-dpc/COVID-19 GitHub repository.
urlfile <- "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv"
COVID_all <- read_csv(url(urlfile)) # read_csv is provided by readr
# COVID_all <- read.csv(url(urlfile)) # base-R alternative (utils::read.csv)

# Per-region extracts: rows for one region, columns 1, 4 and 7 through 20.
LOMBARDIA <- subset(
  COVID_all,
  denominazione_regione == "Lombardia",
  select = c(1L, 4L, 7:20)
)
VENETO <- subset(
  COVID_all,
  denominazione_regione == "Veneto",
  select = c(1L, 4L, 7:20)
)

# Save both extracts next to the script (row names are written as well).
write.csv(LOMBARDIA, file = "LOMBARDIA.csv")
write.csv(VENETO, file = "VENETO.csv")

# Take 21 consecutive rows at hard-coded positions and the first 20 columns.
# NOTE(review): presumably one row per region for a single reporting day -
# confirm against the downloaded file, since the positions are fixed.
REGIONS <- COVID_all[11908:11928, 1:20]

# Hand-entered population and area, one value per selected row.
# NOTE(review): the values must be in the same order as the rows of REGIONS;
# units are not stated here (presumably inhabitants and km^2) - verify.
POP <- c(
  1311580, 562869, 1947131, 5801692, 4459477, 1215220, 5879082,
  1550640, 10060574, 1525271, 533373, 305617, 4356406, 4029053,
  1639591, 4999891, 3729641, 541380, 882015, 125666, 4905854
)
AREA <- c(
  10795, 9992, 15080, 13595, 22451, 7907, 17207,
  5421, 23861, 9366, 7398, 4438, 25399, 19363,
  24090, 25707, 22993, 6207, 8456, 3263, 18316
)
REGIONS[["POP"]] <- POP
REGIONS[["AREA"]] <- AREA

# `deceduti` per million of POP.
REGIONS[["LETHALITY"]] <- with(REGIONS, (deceduti / POP) * 1e6)

# POP per unit of AREA.
REGIONS[["DENSITY"]] <- with(REGIONS, POP / AREA)

# `totale_casi` per million of POP.
REGIONS[["TOTCASI_R"]] <- with(REGIONS, (totale_casi / POP) * 1e6)


# Each plot below is a scatter against DENSITY with one green point and one
# blue region label per row; the follow-up expression overlays a red
# linear-model fit and displays the result.

# LETHALITY vs DENSITY
p <- ggplot(REGIONS, aes(x = DENSITY, y = LETHALITY)) +
  geom_point(colour = "green", size = 3) +
  geom_text(
    aes(label = denominazione_regione),
    hjust = 0, vjust = 0, colour = "blue"
  )
p + geom_smooth(method = lm, colour = "red")

# deceduti vs DENSITY
p <- ggplot(REGIONS, aes(x = DENSITY, y = deceduti)) +
  geom_point(colour = "green", size = 3) +
  geom_text(
    aes(label = denominazione_regione),
    hjust = 0, vjust = 0, colour = "blue"
  )
p + geom_smooth(method = lm, colour = "red")

# TOTCASI_R vs DENSITY
p <- ggplot(REGIONS, aes(x = DENSITY, y = TOTCASI_R)) +
  geom_point(colour = "green", size = 3) +
  geom_text(
    aes(label = denominazione_regione),
    hjust = 0, vjust = 0, colour = "blue"
  )
p + geom_smooth(method = lm, colour = "red")

# totale_casi vs DENSITY
p <- ggplot(REGIONS, aes(x = DENSITY, y = totale_casi)) +
  geom_point(colour = "green", size = 3) +
  geom_text(
    aes(label = denominazione_regione),
    hjust = 0, vjust = 0, colour = "blue"
  )
p + geom_smooth(method = lm, colour = "red")


#--------------------------------------------------------------------------------

# This part of the script scrapes the top-level NBA schedule and results from
# basketball-reference.com.
# The user can set the year and the list of months to determine the window of
# games to scrape.
# At the end of the script, I reconstruct the conference standings based on
# W-L percentage.
### https://www.r-bloggers.com/2018/12/scraping-nba-game-data-from-basketball-reference-com/

listofpackages <- c("rvest", "lubridate")

# Install each package only when it is missing, then attach it.
for (j in listofpackages) {
  # system.file() returns "" for a package that is not installed; this avoids
  # rebuilding the full installed.packages() matrix on every iteration.
  if (!nzchar(system.file(package = j))) {
    install.packages(j)
  }
  library(j, character.only = TRUE)
}


########
# PARAMETERS
########
# Season label inserted into the basketball-reference.com page URL.
year <- "2018"
# Month pages to scrape, October through June, in lower case as they are
# pasted into the URL.
monthList <- tolower(month.name[c(10:12, 1:6)])
# Games on or after this date are labelled as playoff games.
playoff_startDate <- ymd("2018-04-14")
# File name for the scraped data (the saveRDS call that would use it is
# currently commented out).
outputfile <- "NBA-2018_game_data.rds"

########
# SCRIPT FOR SCRAPING DATA STARTS HERE
########
# Scrape the schedule table of one month page and return it as a data frame
# with one row per game; every column is character at this stage.
#   month  - lower-case month name used in the page URL
#   season - season label used in the page URL
scrape_month_games <- function(month, season) {
  # get webpage (named page_url so that base::url, which this script calls
  # elsewhere, is not shadowed)
  page_url <- paste0("https://www.basketball-reference.com/leagues/NBA_",
                     season, "_games-", month, ".html")
  webpage <- read_html(page_url)

  # get column names from the data-stat attribute of the header cells;
  # game_id is prepended because it is added as an extra first column below
  col_names <- webpage %>%
    html_nodes("table#schedule > thead > tr > th") %>%
    html_attr("data-stat")
  col_names <- c("game_id", col_names)

  # row header cells: their text is the date, their csk attribute the game id
  row_headers <- webpage %>%
    html_nodes("table#schedule > tbody > tr > th")

  # extract dates column
  # note that in april, there is a break in the table which just says
  # "Playoffs". this messes with the data merging later, so we get rid of it
  dates <- html_text(row_headers)
  dates <- dates[dates != "Playoffs"]

  # extract game id
  # we need to remove the NA that is due to the "Playoffs" row in april
  game_id <- html_attr(row_headers, "csk")
  game_id <- game_id[!is.na(game_id)]

  # extract all columns (except date); col_names holds two entries (game_id
  # and the date column) that are not part of the td cells
  data <- webpage %>%
    html_nodes("table#schedule > tbody > tr > td") %>%
    html_text() %>%
    matrix(ncol = length(col_names) - 2, byrow = TRUE)

  # combine game IDs, dates and columns in dataframe for this month, add col
  # names
  month_df <- as.data.frame(cbind(game_id, dates, data),
                            stringsAsFactors = FALSE)
  names(month_df) <- col_names
  month_df
}

# Scrape every requested month first and bind the pieces once at the end,
# instead of re-copying the growing data frame on every iteration. The empty
# data frame in front keeps the result a data frame even when monthList is
# empty.
monthly_games <- unname(lapply(monthList, scrape_month_games, season = year))
df <- do.call(rbind, c(list(data.frame()), monthly_games))

# Convert the scraped character columns to their proper types.
score_cols <- c("visitor_pts", "home_pts")
df[score_cols] <- lapply(df[score_cols], as.numeric)
# attendance contains thousands separators, which are stripped first
df[["attendance"]] <- as.numeric(gsub(",", "", df[["attendance"]], fixed = TRUE))
df[["date_game"]] <- mdy(df[["date_game"]])

# Label each game as "Playoff" (on or after playoff_startDate) or "Regular".
df[["game_type"]] <- ifelse(
  df[["date_game"]] >= playoff_startDate,
  "Playoff",
  "Regular"
)

# The box score column is not needed any further.
df[["box_score_text"]] <- NULL

# save to file
#saveRDS(df, outputfile)

########
# SCRIPT FOR RANKING TABLE STARTS HERE
########

# Derive the winning and losing team of every game from the two scores.
# The comparisons are strict in both directions, so a game with equal scores
# gets the home team in both columns, and a game with missing scores gets NA.
df$winner <- ifelse(
  df$visitor_pts > df$home_pts,
  df$visitor_team_name,
  df$home_team_name
)
df$loser <- ifelse(
  df$visitor_pts < df$home_pts,
  df$visitor_team_name,
  df$home_team_name
)

# build up standings table for regular season
regular_df <- subset(df, game_type == "Regular")
teams <- sort(unique(regular_df$visitor_team_name))

# conference & division information: manually input, one entry per team in
# the alphabetical order of `teams`
team_conf <- c("East", "East", "East", "East", "East",
               "East", "West", "West", "East", "West",
               "West", "East", "West", "West", "West",
               "East", "East", "West", "West", "East",
               "West", "East", "East", "West", "West",
               "West", "West", "East", "West", "East")
team_div <- c("Southeast", "Atlantic", "Atlantic", "Southeast", "Central",
              "Central", "Southwest", "Northwest", "Central", "Pacific",
              "Southwest", "Central", "Pacific", "Pacific", "Southwest",
              "Southeast", "Central", "Northwest", "Southwest", "Atlantic",
              "Northwest", "Southeast", "Atlantic", "Pacific", "Northwest",
              "Pacific", "Southwest", "Atlantic", "Northwest", "Southeast")

# The manual vectors are positional, so a different number of scraped teams
# would silently mislabel them or fail with an unhelpful message; stop early
# with a clear one instead.
if (length(teams) != length(team_conf)) {
  stop("Expected ", length(team_conf), " teams in the regular season data, ",
       "found ", length(teams), ".", call. = FALSE)
}

standings <- data.frame(team = teams, stringsAsFactors = FALSE)
standings$conf <- team_conf
standings$div <- team_div

# populate W-L column, W pct
# Counting through a factor with the teams as levels yields one count per
# team in the order of `teams`; games without a recorded winner/loser (NA)
# are left out of the counts instead of turning them into NA.
standings$win <- as.numeric(table(factor(regular_df$winner, levels = teams)))
standings$loss <- as.numeric(table(factor(regular_df$loser, levels = teams)))
standings$wl_pct <- with(standings, win / (win + loss))

# Eastern conference: best W-L percentage first, ties broken by team name.
east_standings <- standings[which(standings$conf == "East"), ]
east_standings[
  order(-east_standings$wl_pct, east_standings$team),
  c("team", "win", "loss")
]

# Western conference: same ordering.
west_standings <- standings[which(standings$conf == "West"), ]
west_standings[
  order(-west_standings$wl_pct, west_standings$team),
  c("team", "win", "loss")
]
