1. Data Exploration: This should include summary statistics, means, medians, quartiles, or any other relevant information about the data set. Please include some conclusions in the R Markdown text.
#install.packages("ggplot2")
require(ggplot2)
## Loading required package: ggplot2
# Load the S&P stock market data (Smarket.csv) straight from GitHub; the data
# creator is credited in the GitHub repository description.
# stringsAsFactors = TRUE is passed explicitly because its default changed to
# FALSE in R 4.0.0; without it Direction is read as character instead of the
# two-level factor ("Down", "Up") that the str() and summary() output assume.
urldata <- read.csv(
  "https://raw.githubusercontent.com/jrovalino/RWk3FinalProject/master/Smarket.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
#urldata
# Basic shape of the data: number of rows, number of columns, column types.
nrow(urldata)
## [1] 1250
ncol(urldata)
## [1] 10
str(urldata)
## 'data.frame':    1250 obs. of  10 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Year     : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
##  $ Lag1     : num  0.381 0.959 1.032 -0.623 0.614 ...
##  $ Lag2     : num  -0.192 0.381 0.959 1.032 -0.623 ...
##  $ Lag3     : num  -2.624 -0.192 0.381 0.959 1.032 ...
##  $ Lag4     : num  -1.055 -2.624 -0.192 0.381 0.959 ...
##  $ Lag5     : num  5.01 -1.055 -2.624 -0.192 0.381 ...
##  $ Volume   : num  1.19 1.3 1.41 1.28 1.21 ...
##  $ Today    : num  0.959 1.032 -0.623 0.614 0.213 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...

# Data exploration: per-column summary statistics for urldata (minimum,
# 1st quartile, median, mean, 3rd quartile and maximum for the numeric columns,
# and Down/Up counts for Direction).

summary(urldata)
##        X               Year           Lag1                Lag2          
##  Min.   :   1.0   Min.   :2001   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.: 313.2   1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500  
##  Median : 625.5   Median :2003   Median : 0.039000   Median : 0.039000  
##  Mean   : 625.5   Mean   :2003   Mean   : 0.003834   Mean   : 0.003919  
##  3rd Qu.: 937.8   3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :1250.0   Max.   :2005   Max.   : 5.733000   Max.   : 5.733000  
##       Lag3                Lag4                Lag5         
##  Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.92200  
##  1st Qu.:-0.640000   1st Qu.:-0.640000   1st Qu.:-0.64000  
##  Median : 0.038500   Median : 0.038500   Median : 0.03850  
##  Mean   : 0.001716   Mean   : 0.001636   Mean   : 0.00561  
##  3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.59700  
##  Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.73300  
##      Volume           Today           Direction 
##  Min.   :0.3561   Min.   :-4.922000   Down:602  
##  1st Qu.:1.2574   1st Qu.:-0.639500   Up  :648  
##  Median :1.4229   Median : 0.038500             
##  Mean   :1.4783   Mean   : 0.003138             
##  3rd Qu.:1.6417   3rd Qu.: 0.596750             
##  Max.   :3.1525   Max.   : 5.733000
# Mean and median of the Volume column, for each year and for all years.
# Volume is always selected by name (never by column position) so the
# statistics stay correct if the column order of the data ever changes.
YrsofData <- unique(urldata$Year)
YrsofData
## [1] 2001 2002 2003 2004 2005

# 2001 mean and median
data2001 <- subset(urldata, Year == 2001)
#data2001
Vol2001MeanRow <- mean(data2001[["Volume"]])
Vol2001MeanRow
## [1] 1.22695
Vol2001MedianRow <- median(data2001[["Volume"]])
Vol2001MedianRow
## [1] 1.2174

# 2002 mean and median
data2002 <- subset(urldata, Year == 2002)
#data2002
Vol2002MeanRow <- mean(data2002[["Volume"]])
Vol2002MeanRow
## [1] 1.428451
Vol2002MedianRow <- median(data2002[["Volume"]])
Vol2002MedianRow
## [1] 1.3786

# 2003 mean and median
data2003 <- subset(urldata, Year == 2003)
#data2003
Vol2003MeanRow <- mean(data2003[["Volume"]])
Vol2003MeanRow
## [1] 1.384693
Vol2003MedianRow <- median(data2003[["Volume"]])
Vol2003MedianRow
## [1] 1.4

# 2004 mean and median
data2004 <- subset(urldata, Year == 2004)
#data2004
Vol2004MeanRow <- mean(data2004[["Volume"]])
Vol2004MeanRow
## [1] 1.424159
Vol2004MedianRow <- median(data2004[["Volume"]])
Vol2004MedianRow
## [1] 1.4227

# 2005 mean and median
data2005 <- subset(urldata, Year == 2005)
#data2005
Vol2005MeanRow <- mean(data2005[["Volume"]])
Vol2005MeanRow
## [1] 1.917298
Vol2005MedianRow <- median(data2005[["Volume"]])
Vol2005MedianRow
## [1] 1.896905

# Overall (2001-2005) mean and median
TotVolMeanRow <- mean(urldata[["Volume"]])
TotVolMeanRow
## [1] 1.478305
TotVolMedianRow <- median(urldata[["Volume"]])
TotVolMedianRow
## [1] 1.42295

# NOTE: Perc2005Better is the RATIO of the 2005 mean to the overall mean
# (1.0 would mean "equal"), so 1.296957 reads as about 29.7% above the overall
# mean. Perc2001Worse, by contrast, is already the fractional shortfall.
Perc2005Better <- Vol2005MeanRow / TotVolMeanRow
Perc2005Better
## [1] 1.296957
Perc2001Worse <- 1 - (Vol2001MeanRow / TotVolMeanRow)
Perc2001Worse
## [1] 0.1700295

#Volume Analysis: The mean Volume by year (2001-2005, respectively) was 1.22695, 1.428451, 1.384693, 1.424159 and 1.917298; the overall mean Volume was 1.478305. The 2005 mean was the highest, about 29.7% above the overall (2001-2005) mean. The 2001 mean was the lowest, about 17% below the overall (2001-2005) mean.

  1. Data wrangling: Please perform some basic transformations. They will need to make sense but could include column renaming, creating a subset of the data, replacing values, or creating new columns with derived data (for example – if it makes sense you could sum two columns together)
# Give the columns shorter names: each Lag column becomes "%<n>D", and Volume,
# Today and Direction become Vol, %Today and Dir. X and Year are unchanged.
names(urldata) <- c(
  "X", "Year",
  "%1D", "%2D", "%3D", "%4D", "%5D",
  "Vol", "%Today", "Dir"
)
# Show the first and last six rows to confirm the new headers.
head(urldata)
##   X Year    %1D    %2D    %3D    %4D    %5D    Vol %Today  Dir
## 1 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959   Up
## 2 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032   Up
## 3 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614   Up
## 5 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213   Up
## 6 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392   Up
tail(urldata)
##         X Year    %1D    %2D    %3D    %4D    %5D     Vol %Today  Dir
## 1245 1245 2005  0.252 -0.024 -0.584 -0.285 -0.141 2.06517  0.422   Up
## 1246 1246 2005  0.422  0.252 -0.024 -0.584 -0.285 1.88850  0.043   Up
## 1247 1247 2005  0.043  0.422  0.252 -0.024 -0.584 1.28581 -0.955 Down
## 1248 1248 2005 -0.955  0.043  0.422  0.252 -0.024 1.54047  0.130   Up
## 1249 1249 2005  0.130 -0.955  0.043  0.422  0.252 1.42236 -0.298 Down
## 1250 1250 2005 -0.298  0.130 -0.955  0.043  0.422 1.38254 -0.489 Down

The Lag# columns were renamed to more meaningful names that represent the percent return # days prior (e.g., %1D is the percent return one day prior).

Data wrangling is also demonstrated by the creation of five by-year subsets (the new data frames used above) and by renaming their lag fields in the same way.

# Apply one shared set of column names to every yearly subset. The vector is
# defined once so the five data frames cannot drift apart through a typo in
# one of several hand-copied name lists.
year_col_names <- c(
  "X", "Year",
  "%1D", "%2D", "%3D", "%4D", "%5D",
  "Vol", "%Today", "Dir"
)

colnames(data2001) <- year_col_names
head(data2001)
##   X Year    %1D    %2D    %3D    %4D    %5D    Vol %Today  Dir
## 1 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959   Up
## 2 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032   Up
## 3 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614   Up
## 5 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213   Up
## 6 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392   Up
colnames(data2002) <- year_col_names
head(data2002)
##       X Year    %1D    %2D    %3D    %4D    %5D    Vol %Today  Dir
## 243 243 2002 -1.115  0.336  0.675  0.412 -0.021 0.9436  0.574   Up
## 244 244 2002  0.574 -1.115  0.336  0.675  0.412 1.1710  0.918   Up
## 245 245 2002  0.918  0.574 -1.115  0.336  0.675 1.3989  0.621   Up
## 246 246 2002  0.621  0.918  0.574 -1.115  0.336 1.5130 -0.650 Down
## 247 247 2002 -0.650  0.621  0.918  0.574 -1.115 1.3083 -0.359 Down
## 248 248 2002 -0.359 -0.650  0.621  0.918  0.574 1.2588 -0.480 Down
colnames(data2003) <- year_col_names
head(data2003)
##       X Year    %1D    %2D    %3D    %4D    %5D    Vol %Today  Dir
## 495 495 2003  0.049  0.456 -1.603 -0.315 -0.547 1.0885  3.320   Up
## 496 496 2003  3.320  0.049  0.456 -1.603 -0.315 1.2292 -0.048 Down
## 497 497 2003 -0.048  3.320  0.049  0.456 -1.603 1.1308  2.247   Up
## 498 498 2003  2.247 -0.048  3.320  0.049  0.456 1.4359 -0.654 Down
## 499 499 2003 -0.654  2.247 -0.048  3.320  0.049 1.5452 -1.409 Down
## 500 500 2003 -1.409 -0.654  2.247 -0.048  3.320 1.4676  1.939   Up
colnames(data2004) <- year_col_names
head(data2004)
##       X Year    %1D    %2D    %3D    %4D    %5D    Vol %Today  Dir
## 747 747 2004  0.205  0.014  1.240  0.169 -0.181 1.0275 -0.309 Down
## 748 748 2004 -0.309  0.205  0.014  1.240  0.169 1.1532  1.240   Up
## 749 749 2004  1.240 -0.309  0.205  0.014  1.240 1.5782  0.129   Up
## 750 750 2004  0.129  1.240 -0.309  0.205  0.014 1.4945  0.237   Up
## 751 751 2004  0.237  0.129  1.240 -0.309  0.205 1.7049  0.496   Up
## 752 752 2004  0.496  0.237  0.129  1.240 -0.309 1.8684 -0.889 Down
colnames(data2005) <- year_col_names
head(data2005)
##         X Year    %1D    %2D    %3D    %4D    %5D    Vol %Today  Dir
## 999   999 2005 -0.134  0.008 -0.007  0.715 -0.431 0.7869 -0.812 Down
## 1000 1000 2005 -0.812 -0.134  0.008 -0.007  0.715 1.5108 -1.167 Down
## 1001 1001 2005 -1.167 -0.812 -0.134  0.008 -0.007 1.7210 -0.363 Down
## 1002 1002 2005 -0.363 -1.167 -0.812 -0.134  0.008 1.7389  0.351   Up
## 1003 1003 2005  0.351 -0.363 -1.167 -0.812 -0.134 1.5691 -0.143 Down
## 1004 1004 2005 -0.143  0.351 -0.363 -1.167 -0.812 1.4779  0.342   Up
  1. Graphics: Please make sure to display at least one scatter plot, box plot and histogram. Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.
# Draw the three required graphics for one data set: a box plot and a
# histogram of %Today, and a scatter plot of %Today against Vol.
#
# df    : data frame with the renamed columns "Vol" and "%Today".
# label : text appended to each plot title so plots for different data sets
#         can be told apart.
# Returns df invisibly; called for its plotting side effects.
#
# The scatter plot refers to the column with backticks (`%Today`). Quoting it
# as '%Today' would map the y axis to the constant string "%Today" rather than
# to the column's values. ggplot() + geom_point() replaces qplot(), which is
# deprecated in current ggplot2 releases.
plot_today_overview <- function(df, label) {
  boxplot(
    df[["%Today"]],
    main = paste("Box plot of %Today,", label),
    ylab = "%Today"
  )
  hist(
    df[["%Today"]],
    main = paste("Histogram of %Today,", label),
    xlab = "%Today"
  )
  # A ggplot object is only drawn when printed; auto-printing does not happen
  # inside a function, so print it explicitly.
  print(
    ggplot(df, aes(x = Vol, y = `%Today`)) +
      geom_point() +
      ggtitle(paste("%Today vs. Vol,", label))
  )
  invisible(df)
}

plot_today_overview(data2001, "2001")

plot_today_overview(data2002, "2002")

plot_today_overview(data2003, "2003")

plot_today_overview(data2004, "2004")

plot_today_overview(data2005, "2005")

plot_today_overview(urldata, "2001-2005")

  1. Meaningful question for analysis: Please state at the beginning a meaningful question for analysis. Use the first three steps and anything else that would be helpful to answer the question you are posing from the data set you chose. Please write a brief conclusion paragraph in R markdown at the end.

What were the largest single-day percent changes, up and down? In which year did each occur? How can %Today be displayed visually? (Hint: use a graph.)

# Confirm the structure of urldata now that its columns carry the new names.
str(urldata)
## 'data.frame':    1250 obs. of  10 variables:
##  $ X     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Year  : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
##  $ %1D   : num  0.381 0.959 1.032 -0.623 0.614 ...
##  $ %2D   : num  -0.192 0.381 0.959 1.032 -0.623 ...
##  $ %3D   : num  -2.624 -0.192 0.381 0.959 1.032 ...
##  $ %4D   : num  -1.055 -2.624 -0.192 0.381 0.959 ...
##  $ %5D   : num  5.01 -1.055 -2.624 -0.192 0.381 ...
##  $ Vol   : num  1.19 1.3 1.41 1.28 1.21 ...
##  $ %Today: num  0.959 1.032 -0.623 0.614 0.213 ...
##  $ Dir   : Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# Row with the largest %Today value (biggest single-day gain).
urldata[which.max(urldata[["%Today"]]), ]
##       X Year    %1D    %2D    %3D    %4D   %5D     Vol %Today Dir
## 383 383 2002 -2.702 -3.291 -3.835 -2.702 0.566 2.44102  5.733  Up
# Row with the smallest %Today value (biggest single-day loss).
urldata[which.min(urldata[["%Today"]]), ]
##       X Year   %1D    %2D    %3D    %4D    %5D    Vol %Today  Dir
## 169 169 2001 0.623 -1.864 -2.239 -0.106 -0.056 1.2766 -4.922 Down
#urldata
# Distribution of %Today over the whole data set.
hist(urldata$'%Today')

Answer:

The largest up % change was 5.733 in 2002

The largest down % was -4.922 in 2001.

  1. BONUS – place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.

Answer: Please see the beginning of the code - it pulls the data from GitHub using: urldata <- read.csv("https://raw.githubusercontent.com/jrovalino/RWk3FinalProject/master/Smarket.csv", header = TRUE, sep = ",")