# ```{r results='hide'}
## Housekeeping
# NOTE: the former `rm(list = ls())` was dropped. Wiping the caller's global
# environment is a side effect a script should not have, and every object
# read below is assigned before it is used.
# Problem 1 a):
# Relative paths such as "All_data/..." are resolved against this directory.
setwd("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning")
# Read the file. It has no header row, hence header = FALSE (default is TRUE).
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 no longer turns
# text columns into factors by default, and the bar chart in c) needs V4 to
# be a factor.
myData <- read.csv(
  "All_data/HW01pb1data.csv",
  header = FALSE,
  stringsAsFactors = TRUE
)
# Verify:
head(myData)
## V1 V2 V3 V4 V5
## 1 0 0 0 10 0
## 2 10 0 10 0 10
## 3 30 0 40 50 20
## 4 0 10 10 10 20
## 5 20 50 10 20 40
## 6 10 0 100 0 10
names(myData)
## [1] "V1" "V2" "V3" "V4" "V5"
str(myData)
## 'data.frame': 800 obs. of 5 variables:
## $ V1: int 0 10 30 0 20 10 10 50 0 30 ...
## $ V2: int 0 0 0 10 50 0 0 10 10 20 ...
## $ V3: int 0 10 40 10 10 100 0 10 0 10 ...
## $ V4: Factor w/ 25 levels "0","10","100",..: 2 1 17 2 10 1 2 2 13 10 ...
## $ V5: Factor w/ 19 levels "0","10","120",..: 1 2 6 6 11 2 2 6 2 11 ...
dim(myData)
## [1] 800 5
# b):
# Columns V4 and V5 were imported as factors (presumably because each holds
# at least one non-numeric entry), so arithmetic on them fails in R, whereas
# Excel evaluates such a column cell by cell.
# c):
plot(myData[, 1])
# Column 1 is numeric, so plot() draws an index plot: each value against its
# row position (1 to 800).
plot(myData[, 4])
# Column 4 is a factor, so plot() draws a bar chart of how many of the 800
# rows fall into each level.
# d)
# 1) Method one: the extra column (V6) was added by hand in Excel and the
#    sheet saved as "HW01pb1data_1.csv". As noted in b), Excel accepts this.
# The relative path is resolved against the working directory set earlier in
# the script. stringsAsFactors = TRUE is spelled out because R >= 4.0.0 no
# longer creates factors by default, and the bar chart below needs V6 to be
# a factor.
myData1 <- read.csv(
  "All_data/HW01pb1data_1.csv",
  header = FALSE,
  stringsAsFactors = TRUE
)
# Verify:
head(myData1)
## V1 V2 V3 V4 V5 V6
## 1 0 0 0 10 0 20
## 2 10 0 10 0 10 10
## 3 30 0 40 50 20 60
## 4 0 10 10 10 20 20
## 5 20 50 10 20 40 30
## 6 10 0 100 0 10 10
names(myData1)
## [1] "V1" "V2" "V3" "V4" "V5" "V6"
str(myData1)
## 'data.frame': 800 obs. of 6 variables:
## $ V1: int 0 10 30 0 20 10 10 50 0 30 ...
## $ V2: int 0 0 0 10 50 0 0 10 10 20 ...
## $ V3: int 0 10 40 10 10 100 0 10 0 10 ...
## $ V4: Factor w/ 25 levels "0","10","100",..: 2 1 17 2 10 1 2 2 13 10 ...
## $ V5: Factor w/ 19 levels "0","10","120",..: 1 2 6 6 11 2 2 6 2 11 ...
## $ V6: Factor w/ 25 levels "#VALUE!","10",..: 11 2 19 11 14 2 11 11 16 14 ...
# The new column is still categorical: its levels include the text
# "#VALUE!", so R cannot read the column as numeric.
plot(myData1[, 6])
# 2) Method two: dplyr
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.2
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# V4 is not numeric after import (a factor, or a character vector on
# R >= 4.0.0), so `V4 + 10` yields only NA (factor) or an error (character).
# Convert through character first: as.numeric() applied directly to a factor
# would return the internal level codes, not the printed values.
# Entries that are not numbers become NA and raise a coercion warning.
MyData_add_column7 <- myData1 %>%
  mutate(V7 = as.numeric(as.character(V4)) + 10)
head(MyData_add_column7)
# Expected first rows (V7 = V4 + 10):
## V1 V2 V3 V4 V5 V6 V7
## 1 0 0 0 10 0 20 20
## 2 10 0 10 0 10 10 10
## 3 30 0 40 50 20 60 60
## 4 0 10 10 10 20 20 20
## 5 20 50 10 20 40 30 30
## 6 10 0 100 0 10 10 10
# Column 2 (V2) is already an integer column, so no conversion is needed:
MyData_add_column8 <- myData1 %>% mutate(V8 = V2 + 10)
head(MyData_add_column8)
## V1 V2 V3 V4 V5 V6 V8
## 1 0 0 0 10 0 20 10
## 2 10 0 10 0 10 10 10
## 3 30 0 40 50 20 60 10
## 4 0 10 10 10 20 20 20
## 5 20 50 10 20 40 30 60
## 6 10 0 100 0 10 10 10
# 3) Same check with base R only:
myData1$V9 <- as.numeric(as.character(myData1$V4)) + 10
head(myData1$V9)
# Expected:
## [1] 20 10 60 20 30 10
# Add 10 to column 2, which is numeric:
myData1$V10 <- myData1$V2 + 10
head(myData1$V10)
## [1] 10 10 10 20 60 10
# Conclusion: R gives a whole column one type, so a column holding any
# non-numeric text is imported as categorical and `+` is not defined for it
# until it is converted explicitly. Excel evaluates each cell on its own,
# which is why it appears to "add a number to a categorical column".
# Problem 2
# a)
myData <- read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/HW01pb2data.csv", header = FALSE)
# Verify:
head(myData)
## V1
## 1 9.031373
## 2 11.215397
## 3 7.389689
## 4 7.217091
## 5 10.416985
## 6 6.277325
str(myData)
## 'data.frame': 2000000 obs. of 1 variable:
## $ V1: num 9.03 11.22 7.39 7.22 10.42 ...
dim(myData)
## [1] 2000000 1
# Draw 10,000 rows at random (sample_n() comes from dplyr, attached earlier).
# The draw is not seeded, so the figures recorded below are from one run.
myData_sample_10000 <- sample_n(myData, 10000)
# b)
# Every statistic is taken on the numeric column V1. mean() does not accept
# a data frame (it warns and returns NA), so the column is used throughout
# for consistency.
max(myData_sample_10000$V1)
## [1] 17.15979
min(myData_sample_10000$V1)
## [1] 0.8979835
var(myData_sample_10000$V1)
## [1] 3.99221
mean(myData_sample_10000$V1)
## [1] 9.44173
quantile(myData_sample_10000$V1, probs = 0.25)
## 25%
## 8.105024
# Same figures in one dplyr call. Plain summarise() replaces the deprecated
# summarise_each(funs(...)), and every term now reads the column being
# summarised instead of one of them reaching back to the global object.
myData_sample_10000 %>%
  summarise(
    mean = mean(V1, na.rm = TRUE),
    min = min(V1, na.rm = TRUE),
    max = max(V1, na.rm = TRUE),
    var = var(V1, na.rm = TRUE),
    quantile = quantile(V1, probs = 0.25, names = FALSE)
  )
## mean min max var quantile
## 1 9.44173 0.8979835 17.15979 3.99221 8.105024
# Or simply:
summary(myData_sample_10000)
## V1
## Min. : 0.898
## 1st Qu.: 8.105
## Median : 9.430
## Mean : 9.442
## 3rd Qu.:10.772
## Max. :17.160
# c)
quantile(myData$V1, 0.25)
## 25%
## 8.10388
## The 1st quartile of the sample (8.105) is very close to that of the full
## data set (8.104).
# d)
# write.csv() writes the row names as a first column by default, which is
# why the Excel formulas below refer to column B.
write.csv(myData_sample_10000, file = "C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/HW1myData_sample1_10000.csv")
# e) Opening the full data set in Excel shows only the first 1,048,576 lines
#    (the worksheet row limit), not all 2,000,000.
# R vs Excel functions:
# max: =MAX(B2:B10001)
# min: =MIN(B2:B10001)
# mean: =AVERAGE(B2:B10001)
# var(): =VAR(B2:B10001)
# quantile(,.25): =QUARTILE(B2:B10001,1)
# Problem 3
# Read files. Neither CSV has a header row, so both are read with
# header = FALSE. With the default header = TRUE the first ocean-view price
# (787) was consumed as a column name and only 1999 observations remained.
# The column is still named "X787" (via col.names) so that existing
# references to HW1_3_OceanViewdata$X787 keep working.
HW1_3_OceanViewdata <- read.csv(
  "C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/HW01pb3OceanViewdata.csv",
  header = FALSE,
  col.names = "X787"
)
HW1_3_Desertdata <- read.csv(
  "C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/HW01pb3Desertdata.csv",
  header = FALSE
)
# Expected output below reflects the recovered first row (2000 rows, not 1999).
dim(HW1_3_OceanViewdata)
## [1] 2000 1
dim(HW1_3_Desertdata)
## [1] 5000 1
names(HW1_3_OceanViewdata) # "X787": set explicitly through col.names
## [1] "X787"
names(HW1_3_Desertdata) # "V1": default name when header = FALSE
## [1] "V1"
head(HW1_3_OceanViewdata)
## X787
## 1 787
## 2 1052
## 3 1240
## 4 1412
## 5 1545
## 6 1644
head(HW1_3_Desertdata, 5)
## V1
## 1 93
## 2 51
## 3 89
## 4 83
## 5 56
str(HW1_3_OceanViewdata)
## 'data.frame': 2000 obs. of 1 variable:
## $ X787: int 787 1052 1240 1412 1545 1644 1741 1761 1703 1604 ...
str(HW1_3_Desertdata)
## 'data.frame': 5000 obs. of 1 variable:
## $ V1: int 93 51 89 83 56 27 10 58 48 19 ...
# a)
# One box plot per price sample.
boxplot(
  HW1_3_OceanViewdata,
  main = "Ocean View Data Box Plots",
  col = "blue"
)
boxplot(
  HW1_3_Desertdata,
  main = "Desert View Data Box Plots",
  col = "blue"
)
# Each plot shows the median (line inside the box), the lower and upper
# quartiles (box edges), the whiskers, and outliers as individual points.
# b)
# Look at the data frame and at its single column side by side.
head(HW1_3_Desertdata)
## V1
## 1 93
## 2 51
## 3 89
## 4 83
## 5 56
## 6 27
head(HW1_3_Desertdata$V1)
## [1] 93 51 89 83 56 27
str(HW1_3_Desertdata)
## 'data.frame': 5000 obs. of 1 variable:
## $ V1: int 93 51 89 83 56 27 10 58 48 19 ...
str(HW1_3_Desertdata$V1)
## int [1:5000] 93 51 89 83 56 27 10 58 48 19 ...
# hist() takes a single numeric vector, not a data frame. Passing the data
# frame itself fails:
#   hist(HW1_3_Desertdata)
#   Error in hist.default(HW1_3_Desertdata) : 'x' must be numeric
# so the column is extracted first.
hist(HW1_3_Desertdata$V1)
# Same for the ocean-view prices (X787 is the column name):
hist(HW1_3_OceanViewdata$X787)
# Customised version: 500-wide bins from 0 to 3000, one colour per bin.
hist(
  HW1_3_OceanViewdata$X787,
  main = "Frequency Histogram of Ocean View Houses",
  xlab = "Ocean View Prices in thousands",
  ylab = "frequency",
  breaks = seq(0, 3000, by = 500),
  col = c("green", "red", "blue", "yellow", "orange")
)
# c)
# Reference: http://www.r-bloggers.com/exploratory-data-analysis-2-ways-of-plotting-empirical-cumulative-distribution-functions-in-r/
# Build each empirical CDF once and reuse the object for printing and
# plotting. ecdf() returns a step function.
HW1_3_OceanViewdat_ecdf <- ecdf(HW1_3_OceanViewdata$X787)
HW1_3_Desertdata_ecdf <- ecdf(HW1_3_Desertdata$V1)
HW1_3_OceanViewdat_ecdf
## Empirical CDF
## Call: ecdf(HW1_3_OceanViewdata$X787)
## x[1:567] = 787, 1029, 1052, ..., 2133, 2401
HW1_3_Desertdata_ecdf
## Empirical CDF
## Call: ecdf(HW1_3_Desertdata$V1)
## x[1:423] = 10, 11, 12, ..., 1935, 2654
str(HW1_3_OceanViewdat_ecdf)
## function (v)
## - attr(*, "class")= chr [1:3] "ecdf" "stepfun" "function"
## - attr(*, "call")= language ecdf(HW1_3_OceanViewdata$X787)
# Axis labels and titles had spelling mistakes ("Oceab", "Deserrt",
# "Probabilty", "Cumluative"); they are corrected here because they appear
# on the rendered figures.
plot(
  HW1_3_OceanViewdat_ecdf,
  xlab = "Ocean View Housing Price in thousands",
  ylab = "Cumulative Probability",
  main = "Empirical Cumulative Distribution\nOcean View Housing Price in thousands"
)
plot(
  HW1_3_Desertdata_ecdf,
  xlab = "Desert Housing Price in thousands",
  ylab = "Cumulative Probability",
  main = "Empirical Cumulative Distribution\nDesert Housing Price"
)
# Problem 4: work on a plain data-frame copy of the built-in Orange data set
# (trunk circumference of five orange trees measured at several ages).
orange <- as.data.frame(Orange)
head(orange)
## Tree age circumference
## 1 1 118 30
## 2 1 484 58
## 3 1 664 87
## 4 1 1004 115
## 5 1 1231 120
## 6 1 1372 142
names(orange)
## [1] "Tree" "age" "circumference"
dim(orange)
## [1] 35 3
str(orange)
## 'data.frame': 35 obs. of 3 variables:
## $ Tree : Ord.factor w/ 5 levels "3"<"1"<"5"<"2"<..: 2 2 2 2 2 2 2 4 4 4 ...
## $ age : num 118 484 664 1004 1231 ...
## $ circumference: num 30 58 87 115 120 142 145 33 69 111 ...
## - attr(*, "formula")=Class 'formula' length 3 circumference ~ age | Tree
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "labels")=List of 2
## ..$ x: chr "Time since December 31, 1968"
## ..$ y: chr "Trunk circumference"
## - attr(*, "units")=List of 2
## ..$ x: chr "(days)"
## ..$ y: chr "(mm)"
# a)
# Circumference (y) against age (x) for all trees together.
plot(
  circumference ~ age,
  data = orange,
  main = "Scatter Plot: Circumference over Age"
)
# b)
# Select the rows of tree "1" with an SQL query against the data frame,
# then correlate its age with its circumference.
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.1.2
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.1.2
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.1.2
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.1.1
Tree1 <- sqldf("SELECT * FROM orange WHERE Tree='1'")
## Loading required package: tcltk
print(Tree1)
## Tree age circumference
## 1 1 118 30
## 2 1 484 58
## 3 1 664 87
## 4 1 1004 115
## 5 1 1231 120
## 6 1 1372 142
## 7 1 1582 145
with(Tree1, cor(age, circumference))
## [1] 0.9854675
# c)
# Method 1: compute cov() and cor() of age vs. circumference for each tree
# separately, then combine the figures with data.frame().
Tree <- c(1, 2, 3, 4, 5)
Tree
## [1] 1 2 3 4 5
Tree1 <- subset(orange, orange$Tree == "1")
Tree1
## Tree age circumference
## 1 1 118 30
## 2 1 484 58
## 3 1 664 87
## 4 1 1004 115
## 5 1 1231 120
## 6 1 1372 142
## 7 1 1582 145
cor1 <- cor(Tree1$age, Tree1$circumference)
cov1 <- cov(Tree1$age, Tree1$circumference)
Tree2 <- subset(orange, orange$Tree == "2")
cor2 <- cor(Tree2$age, Tree2$circumference)
cov2 <- cov(Tree2$age, Tree2$circumference)
Tree3 <- subset(orange, orange$Tree == "3")
cor3 <- cor(Tree3$age, Tree3$circumference)
cov3 <- cov(Tree3$age, Tree3$circumference)
Tree4 <- subset(orange, orange$Tree == "4")
cor4 <- cor(Tree4$age, Tree4$circumference)
cov4 <- cov(Tree4$age, Tree4$circumference)
Tree5 <- subset(orange, orange$Tree == "5")
cor5 <- cor(Tree5$age, Tree5$circumference)
cov5 <- cov(Tree5$age, Tree5$circumference)
# The two vectors used to be filled the wrong way round (cor values under
# COVARIANCE and cov values under CORRELATION), so the table was mislabelled.
COVARIANCE <- c(cov1, cov2, cov3, cov4, cov5)
CORRELATION <- c(cor1, cor2, cor3, cor4, cor5)
result <- data.frame(Tree, COVARIANCE, CORRELATION)
result
## Tree COVARIANCE CORRELATION
## 1 1 22340.07 0.9854675
## 2 2 34290.45 0.9873624
## 3 3 22239.83 0.9881766
## 4 4 37062.62 0.9844610
## 5 5 30442.81 0.9877376
# Method 2: one grouped summary with plyr instead of per-tree variables.
# plyr is called through its namespace rather than attached with library():
# attaching it after dplyr masks dplyr's arrange/mutate/summarise/..., which
# plyr's own start-up message warns against.
# COVARIANCE is computed with cov(); it used to call cor(), which made both
# columns identical.
plyr::ddply(
  orange, "Tree", plyr::summarize,
  COVARIANCE = cov(age, circumference),
  CORRELATION = cor(age, circumference)
)
# Rows follow the level order of the ordered factor Tree (3, 1, 5, 2, 4).
## Tree COVARIANCE CORRELATION
## 1 3 22239.83 0.9881766
## 2 1 22340.07 0.9854675
## 3 5 30442.81 0.9877376
## 4 2 34290.45 0.9873624
## 5 4 37062.62 0.9844610
# Problem 5 a)
# Location and spread of the desert house prices (single column V1).
mean(HW1_3_Desertdata$V1)
## [1] 144.0348
median(HW1_3_Desertdata$V1)
## [1] 89
sd(HW1_3_Desertdata$V1)
## [1] 179.7074
summary(HW1_3_Desertdata)
## V1
## Min. : 10
## 1st Qu.: 51
## Median : 89
## Mean : 144
## 3rd Qu.: 172
## Max. :2654
# The median (89) is smaller than the mean (144).
# b)
# A mean above the median indicates a right-skewed distribution:
# most of the weight sits on the left (lower house prices), with a long tail
# to the right (higher house prices).
# c)
# Shift: add 10 to every price. Every summary figure moves up by 10 while
# the standard deviation stays at 179.7074.
HW1_3_Desertdata_10 <- HW1_3_Desertdata + 10
summary(HW1_3_Desertdata_10)
## V1
## Min. : 20
## 1st Qu.: 61
## Median : 99
## Mean : 154
## 3rd Qu.: 182
## Max. :2664
sd(HW1_3_Desertdata_10$V1)
## [1] 179.7074
# Scale: multiply every price by 2. Every summary figure doubles, and so
# does the standard deviation (359.4147).
HW1_3_Desertdata_x2 <- HW1_3_Desertdata * 2
summary(HW1_3_Desertdata_x2)
## V1
## Min. : 20.0
## 1st Qu.: 102.0
## Median : 178.0
## Mean : 288.1
## 3rd Qu.: 344.0
## Max. :5308.0
sd(HW1_3_Desertdata_x2$V1)
## [1] 359.4147
# Histograms of the original, shifted and doubled prices for comparison:
hist(HW1_3_Desertdata$V1)
hist(HW1_3_Desertdata_10$V1)
hist(HW1_3_Desertdata_x2$V1)