1) a) V1, V2 and V3 are quantitative; V4 and V5 are qualitative. For a quantitative variable, statistical operations such as the median, mean and quantiles are possible because the variable is numeric, whereas for an ordered category or a qualitative variable they are not. summary() shows clearly that V1, V2 and V3 are numeric, while V4 and V5 are factors with levels such as 0, 10, 100, 110, etc.
# Problem 1: load the raw data. header = FALSE treats the first line as data,
# so the columns are auto-named V1, V2, ...
# stringsAsFactors is set explicitly: its default changed from TRUE to FALSE in
# R 4.0.0, and the inspection that follows (level counts in summary(),
# levels(inputfile$V4)) only works when text columns are read as factors.
inputfile <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb1data.csv",
  header = FALSE,
  stringsAsFactors = TRUE
)
head(inputfile)
## V1 V2 V3 V4 V5
## 1 0 0 0 10 0
## 2 10 0 10 0 10
## 3 30 0 40 50 20
## 4 0 10 10 10 20
## 5 20 50 10 20 40
## 6 10 0 100 0 10
# Per-column overview: numeric columns print quantiles and the mean, factor
# columns print counts of their most frequent levels.
summary(inputfile)
## V1 V2 V3 V4
## Min. : 0.0 Min. : 0.0 Min. : 0.0 0 :223
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 5.0 10 :211
## Median : 10.0 Median : 10.0 Median : 10.0 5 :148
## Mean : 12.4 Mean : 11.6 Mean : 11.6 20 : 82
## 3rd Qu.: 10.0 3rd Qu.: 10.0 3rd Qu.: 10.0 15 : 37
## Max. :270.0 Max. :130.0 Max. :180.0 30 : 30
## (Other): 69
## V5
## 10 :253
## 0 :200
## 5 :149
## 20 : 78
## 15 : 35
## 30 : 22
## (Other): 63
# List the distinct values of V4 (as factor levels) to spot non-numeric entries.
levels(inputfile$V4)
## [1] "0" "10" "100" "110" "120"
## [6] "140" "15" "150" "160" "20"
## [11] "200" "25" "30" "35" "40"
## [16] "5" "50" "55" "60" "65"
## [21] "70" "80" "85" "90" "thirty five"
# Re-read the same file keeping text columns as plain character vectors so they
# can be converted to integers afterwards. The argument name is spelled out in
# full (stringsAsFactors) rather than relying on partial argument matching.
inputfile_1 <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb1data.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
summary(inputfile_1)
## V1 V2 V3 V4
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Length:800
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 5.0 Class :character
## Median : 10.0 Median : 10.0 Median : 10.0 Mode :character
## Mean : 12.4 Mean : 11.6 Mean : 11.6
## 3rd Qu.: 10.0 3rd Qu.: 10.0 3rd Qu.: 10.0
## Max. :270.0 Max. :130.0 Max. :180.0
## V5
## Length:800
## Class :character
## Mode :character
##
##
##
# Convert the two character columns to integers in one pass; strtoi() yields NA
# for entries that are not valid base-10 integers.
inputfile_1[c("V4", "V5")] <- lapply(inputfile_1[c("V4", "V5")], strtoi)
summary(inputfile_1)
## V1 V2 V3 V4
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 5.0 1st Qu.: 0.0
## Median : 10.0 Median : 10.0 Median : 10.0 Median : 10.0
## Mean : 12.4 Mean : 11.6 Mean : 11.6 Mean : 12.7
## 3rd Qu.: 10.0 3rd Qu.: 10.0 3rd Qu.: 10.0 3rd Qu.: 15.0
## Max. :270.0 Max. :130.0 Max. :180.0 Max. :200.0
## NA's :1
## V5
## Min. : 0.0
## 1st Qu.: 2.5
## Median : 10.0
## Mean : 11.6
## 3rd Qu.: 10.0
## Max. :255.0
## NA's :1
## [1] FALSE
## [1] TRUE
# Load the second data file. header = FALSE treats the first line as data.
# stringsAsFactors is set explicitly because its default changed from TRUE to
# FALSE in R 4.0.0; the summary() that follows reports level counts for the
# text columns, which requires them to be factors.
inputfile_2 <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb1data1.csv",
  header = FALSE,
  stringsAsFactors = TRUE
)
head(inputfile_2)
## V1 V2 V3 V4 V5 V6
## 1 0 0 0 10 0 20
## 2 10 0 10 0 10 10
## 3 30 0 40 50 20 60
## 4 0 10 10 10 20 20
## 5 20 50 10 20 40 30
## 6 10 0 100 0 10 10
# Per-column overview of the second file (quantiles for numeric columns, level
# counts for factor columns).
summary(inputfile_2)
## V1 V2 V3 V4
## Min. : 0.0 Min. : 0.0 Min. : 0.0 0 :223
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 5.0 10 :211
## Median : 10.0 Median : 10.0 Median : 10.0 5 :148
## Mean : 12.4 Mean : 11.6 Mean : 11.6 20 : 82
## 3rd Qu.: 10.0 3rd Qu.: 10.0 3rd Qu.: 10.0 15 : 37
## Max. :270.0 Max. :130.0 Max. :180.0 30 : 30
## (Other): 69
## V5 V6
## 10 :253 10 :223
## 0 :200 20 :211
## 5 :149 15 :148
## 20 : 78 30 : 82
## 15 : 35 25 : 37
## 30 : 22 40 : 30
## (Other): 63 (Other): 69
# Problem 2: load the single-column data set (first line is data, not a header)
# and preview the first rows.
datafile <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb2data.csv",
  header = FALSE
)
head(datafile)
## V1
## 1 9.031
## 2 11.215
## 3 7.390
## 4 7.217
## 5 10.417
## 6 6.277
# Draw 10000 values from V1 with replacement. The seed is fixed so that the
# statistics computed from this sample can be regenerated on a re-run.
set.seed(1)
sample_datafile <- sample(datafile$V1, size = 10000, replace = TRUE)
head(sample_datafile)
## [1] 10.405 9.864 4.857 10.318 9.813 7.935
b)
# Descriptive statistics of the resampled values: mean, maximum, variance,
# first quartile, five-number summary, and a histogram.
mean(sample_datafile)
## [1] 9.454
max(sample_datafile)
## [1] 16.71
var(sample_datafile)
## [1] 3.956
quantile(sample_datafile, probs = 0.25)
## 25%
## 8.108
summary(sample_datafile)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.46 8.11 9.45 9.45 10.80 16.70
# The argument expression is left untouched: hist() uses it for the plot title.
hist(sample_datafile)
# The same descriptive statistics computed on the full V1 column, for
# comparison with the resampled values.
mean(datafile$V1)
## [1] 9.451
max(datafile$V1)
## [1] 18.97
var(datafile$V1)
## [1] 4.002
quantile(datafile$V1, probs = 0.25)
## 25%
## 8.104
summary(datafile$V1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.112 8.100 9.450 9.450 10.800 19.000
# The argument expression is left untouched: hist() uses it for the plot title.
hist(datafile$V1)
3 a) Desert house box plot interpretation:
The box plot is skewed to the right, with a large number of outliers (the circles are the outliers). It clearly shows that the median (the solid line in the box) is approximately 90 and that the first and third quartiles are approximately 50 and 180. The box plot also shows the range of the data, here approximately 0 to 2700. Therefore, we may say that the data we are looking at are biased towards low prices, with a long tail of expensive houses.
Ocean house box plot interpretation:
The box plot is roughly symmetric, with only a few outliers that could be removed. The median is approximately 1600, and Q1 and Q3 are approximately 1400 and 1700. The range of the data set can also be read off the plot: approximately 800 to 2400.
# Problem 3: load both house-price files (first line of each file is data).
ocean_houses <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb3OceanViewdata.csv",
  header = FALSE
)
desert_houses <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb3Desertdata.csv",
  header = FALSE
)

# One box plot per data set.
boxplot(desert_houses, main = "Desert Houses Box Plot")
boxplot(ocean_houses, main = "Ocean Houses Box Plot")

# Ocean prices: default histogram, then one with fixed 500-wide bins.
hist(ocean_houses$V1)
hist(
  ocean_houses$V1,
  breaks = seq(0, 3000, by = 500),
  main = "Histogram of Ocean houses",
  xlab = "Ocean house price in Thousand of dollars"
)

# Overlay the two empirical CDFs. The x-axis spans at least 0-2500 and is
# widened to the largest observed price so that neither curve is clipped (the
# desert data's maximum is above 2500, see summary(desert_houses)).
plot(
  ecdf(ocean_houses[, 1]),
  xlim = c(0, max(2500, ocean_houses[, 1], desert_houses[, 1], na.rm = TRUE)),
  verticals = TRUE,
  col = "blue",
  main = "Empirical Cumulative Distribution for Houses",
  xlab = "Price in Thousand Dollars",
  ylab = "ECDF"
)
lines(ecdf(desert_houses[, 1]), verticals = TRUE, col = "black")
legend(
  1800, 0.8,
  c("Desert houses", "Ocean houses"),
  lty = c(1, 1),
  col = c("black", "blue")
)
4 a)
# Problem 4a: scatter plot of trunk circumference against age for the built-in
# Orange data set, one colour per tree (colours follow the factor codes of
# Tree), with a legend whose text colours follow the same level order.
orange <- as.data.frame(Orange)
plot(
  orange$age,
  orange$circumference,
  xlab = "Age of Tree",
  ylab = "Trunk Circumference in mm",
  pch = 20,
  main = "Circumference vs. Age for Trees",
  col = orange$Tree
)
legend(
  "bottomright",
  legend = levels(factor(orange$Tree)),
  text.col = seq_along(levels(orange$Tree)),
  title = "Tree Type"
)
b)
# Problem 4b: keep only the measurements of tree 1 and correlate its age with
# its trunk circumference.
orange_1 <- orange[orange$Tree == 1, ]
cor(orange_1$age, orange_1$circumference)
## [1] 0.9855
c)
# Problem 4c: covariance and correlation of age vs. circumference per tree.
#
# Each statistic is computed by looking up the rows of the tree named in
# tab_stat$Tree. Orange$Tree is an ordered factor whose level order is not
# 1, 2, 3, 4, 5, so taking the by()/factor-level order and pasting it next to
# the sorted labels would attach the values to the wrong trees (e.g. tree 1's
# correlation, 0.9855, would be shown against tree 2).
tree_levels <- sort(levels(orange$Tree))
tab_stat <- data.frame(Tree = tree_levels, stringsAsFactors = FALSE)
tab_stat$COVARIANCE <- vapply(
  as.character(tab_stat$Tree),
  function(tree_id) {
    rows <- as.character(orange$Tree) == tree_id
    cov(orange$age[rows], orange$circumference[rows])
  },
  numeric(1),
  USE.NAMES = FALSE
)
tab_stat$CORRELATION <- vapply(
  as.character(tab_stat$Tree),
  function(tree_id) {
    rows <- as.character(orange$Tree) == tree_id
    cor(orange$age[rows], orange$circumference[rows])
  },
  numeric(1),
  USE.NAMES = FALSE
)
tab_stat
## Tree COVARIANCE CORRELATION
## 1 1 22340 0.9855
## 2 2 34290 0.9874
## 3 3 22240 0.9882
## 4 4 37063 0.9845
## 5 5 30443 0.9877
# Effect of adding a constant (10) to circumference: covariance and correlation
# are both unchanged. Values are looked up per label in tab_stat$Tree so each
# row holds the statistics of the tree it names (Orange$Tree's factor-level
# order differs from the sorted labels).
tab_stat$COVARIANCE <- vapply(
  as.character(tab_stat$Tree),
  function(tree_id) {
    rows <- as.character(orange$Tree) == tree_id
    cov(orange$age[rows], orange$circumference[rows] + 10)
  },
  numeric(1),
  USE.NAMES = FALSE
)
tab_stat$CORRELATION <- vapply(
  as.character(tab_stat$Tree),
  function(tree_id) {
    rows <- as.character(orange$Tree) == tree_id
    cor(orange$age[rows], orange$circumference[rows] + 10)
  },
  numeric(1),
  USE.NAMES = FALSE
)
tab_stat
## Tree COVARIANCE CORRELATION
## 1 1 22340 0.9855
## 2 2 34290 0.9874
## 3 3 22240 0.9882
## 4 4 37063 0.9845
## 5 5 30443 0.9877
# Effect of doubling circumference: covariance doubles, correlation is
# unchanged. Values are looked up per label in tab_stat$Tree so each row holds
# the statistics of the tree it names (Orange$Tree's factor-level order differs
# from the sorted labels).
tab_stat$COVARIANCE <- vapply(
  as.character(tab_stat$Tree),
  function(tree_id) {
    rows <- as.character(orange$Tree) == tree_id
    cov(orange$age[rows], orange$circumference[rows] * 2)
  },
  numeric(1),
  USE.NAMES = FALSE
)
tab_stat$CORRELATION <- vapply(
  as.character(tab_stat$Tree),
  function(tree_id) {
    rows <- as.character(orange$Tree) == tree_id
    cor(orange$age[rows], orange$circumference[rows] * 2)
  },
  numeric(1),
  USE.NAMES = FALSE
)
tab_stat
## Tree COVARIANCE CORRELATION
## 1 1 44680 0.9855
## 2 2 68581 0.9874
## 3 3 44480 0.9882
## 4 4 74125 0.9845
## 5 5 60886 0.9877
# Effect of multiplying circumference by -2: covariance is scaled by -2 and the
# correlation flips sign. Values are looked up per label in tab_stat$Tree so
# each row holds the statistics of the tree it names (Orange$Tree's
# factor-level order differs from the sorted labels).
tab_stat$COVARIANCE <- vapply(
  as.character(tab_stat$Tree),
  function(tree_id) {
    rows <- as.character(orange$Tree) == tree_id
    cov(orange$age[rows], orange$circumference[rows] * -2)
  },
  numeric(1),
  USE.NAMES = FALSE
)
tab_stat$CORRELATION <- vapply(
  as.character(tab_stat$Tree),
  function(tree_id) {
    rows <- as.character(orange$Tree) == tree_id
    cor(orange$age[rows], orange$circumference[rows] * -2)
  },
  numeric(1),
  USE.NAMES = FALSE
)
tab_stat
## Tree COVARIANCE CORRELATION
## 1 1 -44680 -0.9855
## 2 2 -68581 -0.9874
## 3 3 -44480 -0.9882
## 4 4 -74125 -0.9845
## 5 5 -60886 -0.9877
5 a) The median is less than the mean. b) The data are right-skewed, since the median is less than the mean. c) When 10 (thousand dollars) is added to every price, the median also increases by 10 (thousand dollars). d) When every price is doubled, the median doubles too.
# Problem 5: inspect the desert house prices (structure, first rows, summary).
str(desert_houses)
## 'data.frame': 5000 obs. of 1 variable:
## $ V1: int 93 51 89 83 56 27 10 58 48 19 ...
head(desert_houses)
## V1
## 1 93
## 2 51
## 3 89
## 4 83
## 5 56
## 6 27
summary(desert_houses)
## V1
## Min. : 10
## 1st Qu.: 51
## Median : 89
## Mean : 144
## 3rd Qu.: 172
## Max. :2654
# The argument expression is left untouched: hist() uses it for the plot title.
hist(desert_houses$V1)
# Median after shifting every price by 10, then after doubling every price.
median(desert_houses$V1 + 10)
## [1] 99
median(desert_houses$V1 * 2)
## [1] 178