Homework 1

1) a) V1, V2 & V3 are quantitative; V4 & V5 are qualitative. For a quantitative variable, statistical operations such as the median, mean and quantiles are possible because the variable is numeric, whereas for an ordered category or other qualitative variable they are not. summary() shows clearly that V1, V2 and V3 are numeric, while V4 and V5 have levels such as 0, 10, 100, 110, etc.

# Read the five-column data set. The file has no header row, so R names the
# columns V1..V5. stringsAsFactors = TRUE is spelled out because R >= 4.0 no
# longer converts strings to factors by default, and the analysis that follows
# (per-level counts in summary(), levels(inputfile$V4)) relies on V4 and V5
# being read as factors.
inputfile <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb1data.csv",
  header = FALSE,
  stringsAsFactors = TRUE
)
head(inputfile)

##   V1 V2  V3 V4 V5
## 1  0  0   0 10  0
## 2 10  0  10  0 10
## 3 30  0  40 50 20
## 4  0 10  10 10 20
## 5 20 50  10 20 40
## 6 10  0 100  0 10

summary(inputfile)

##        V1              V2              V3              V4     
##  Min.   :  0.0   Min.   :  0.0   Min.   :  0.0   0      :223  
##  1st Qu.:  0.0   1st Qu.:  0.0   1st Qu.:  5.0   10     :211  
##  Median : 10.0   Median : 10.0   Median : 10.0   5      :148  
##  Mean   : 12.4   Mean   : 11.6   Mean   : 11.6   20     : 82  
##  3rd Qu.: 10.0   3rd Qu.: 10.0   3rd Qu.: 10.0   15     : 37  
##  Max.   :270.0   Max.   :130.0   Max.   :180.0   30     : 30  
##                                                  (Other): 69  
##        V5     
##  10     :253  
##  0      :200  
##  5      :149  
##  20     : 78  
##  15     : 35  
##  30     : 22  
##  (Other): 63

levels(inputfile$V4)

##  [1] "0"           "10"          "100"         "110"         "120"        
##  [6] "140"         "15"          "150"         "160"         "20"         
## [11] "200"         "25"          "30"          "35"          "40"         
## [16] "5"           "50"          "55"          "60"          "65"         
## [21] "70"          "80"          "85"          "90"          "thirty five"

Yes. V4 and V5 look very much like numeric variables. However, they contain entries that are not numbers (V4, for example, contains the value "thirty five"), which makes them behave like qualitative variables rather than numeric ones. The file was read as inputfile <- read.csv("/home/archana/ML works_ucsc/HW01pb1data.csv", header = F). In R versions before 4.0, strings are implicitly read as factors, and because V4 and V5 were read as character rather than numeric, R stored them as categorical variables. This can be changed by reading the input file as
inputfile <- read.csv("/home/archana/ML works_ucsc/HW01pb1data.csv", header = F, stringsAsFactors = F)

# Re-read the same file, keeping text columns as plain character vectors
# instead of factors. The argument name is written out in full
# (stringsAsFactors) rather than relying on partial matching of
# "stringsAsFactor", and FALSE is used instead of the reassignable alias F.
inputfile_1 <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb1data.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
summary(inputfile_1)

##        V1              V2              V3             V4           
##  Min.   :  0.0   Min.   :  0.0   Min.   :  0.0   Length:800        
##  1st Qu.:  0.0   1st Qu.:  0.0   1st Qu.:  5.0   Class :character  
##  Median : 10.0   Median : 10.0   Median : 10.0   Mode  :character  
##  Mean   : 12.4   Mean   : 11.6   Mean   : 11.6                     
##  3rd Qu.: 10.0   3rd Qu.: 10.0   3rd Qu.: 10.0                     
##  Max.   :270.0   Max.   :130.0   Max.   :180.0                     
##       V5           
##  Length:800        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Convert the character columns V4 and V5 to integers in one pass.
# strtoi() returns NA for any entry it cannot parse as an integer, so such
# entries show up as NA's in the summary.
inputfile_1[c("V4", "V5")] <- lapply(inputfile_1[c("V4", "V5")], strtoi)
summary(inputfile_1)

##        V1              V2              V3              V4       
##  Min.   :  0.0   Min.   :  0.0   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:  0.0   1st Qu.:  0.0   1st Qu.:  5.0   1st Qu.:  0.0  
##  Median : 10.0   Median : 10.0   Median : 10.0   Median : 10.0  
##  Mean   : 12.4   Mean   : 11.6   Mean   : 11.6   Mean   : 12.7  
##  3rd Qu.: 10.0   3rd Qu.: 10.0   3rd Qu.: 10.0   3rd Qu.: 15.0  
##  Max.   :270.0   Max.   :130.0   Max.   :180.0   Max.   :200.0  
##                                                  NA's   :1      
##        V5       
##  Min.   :  0.0  
##  1st Qu.:  2.5  
##  Median : 10.0  
##  Mean   : 11.6  
##  3rd Qu.: 10.0  
##  Max.   :255.0  
##  NA's   :1

Because inputfile[,1] is not a factor (i.e. it is not categorical), it is just a set of integer values, which are plotted as scattered points against their index (800 values in total in the dataset). However, when plotting inputfile[,4], where V4 is a categorical variable, each of these 800 values is assigned to its category and plotted at that category's position.

plot of chunk unnamed-chunk-3

## [1] FALSE

plot of chunk unnamed-chunk-3

## [1] TRUE

V6 is also a categorical variable after importing the data into R, as it is derived from a categorical variable.

# Read the six-column variant of the data set (no header row, so columns are
# auto-named V1..V6). stringsAsFactors = TRUE is explicit because R >= 4.0 no
# longer turns string columns into factors by default, and the point of this
# step is to show that V6 is imported as a categorical (factor) column.
inputfile_2 <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb1data1.csv",
  header = FALSE,
  stringsAsFactors = TRUE
)
head(inputfile_2)

##   V1 V2  V3 V4 V5 V6
## 1  0  0   0 10  0 20
## 2 10  0  10  0 10 10
## 3 30  0  40 50 20 60
## 4  0 10  10 10 20 20
## 5 20 50  10 20 40 30
## 6 10  0 100  0 10 10

summary(inputfile_2)

##        V1              V2              V3              V4     
##  Min.   :  0.0   Min.   :  0.0   Min.   :  0.0   0      :223  
##  1st Qu.:  0.0   1st Qu.:  0.0   1st Qu.:  5.0   10     :211  
##  Median : 10.0   Median : 10.0   Median : 10.0   5      :148  
##  Mean   : 12.4   Mean   : 11.6   Mean   : 11.6   20     : 82  
##  3rd Qu.: 10.0   3rd Qu.: 10.0   3rd Qu.: 10.0   15     : 37  
##  Max.   :270.0   Max.   :130.0   Max.   :180.0   30     : 30  
##                                                  (Other): 69  
##        V5            V6     
##  10     :253   10     :223  
##  0      :200   20     :211  
##  5      :149   15     :148  
##  20     : 78   30     : 82  
##  15     : 35   25     : 37  
##  30     : 22   40     : 30  
##  (Other): 63   (Other): 69

This question uses the data in the file HW01pb2data.csv. Download it to your computer

# Read the single-column population data (no header row, so the column is
# auto-named V1). FALSE is used instead of F, which is an ordinary variable
# that can be reassigned.
datafile <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb2data.csv",
  header = FALSE
)
head(datafile)

##       V1
## 1  9.031
## 2 11.215
## 3  7.390
## 4  7.217
## 5 10.417
## 6  6.277

# Draw 10,000 values, with replacement, from the population column.
# The seed is fixed so that the sample, and every statistic computed from it,
# is the same on each run of the report.
set.seed(42)
sample_datafile <- sample(datafile$V1, size = 10000, replace = TRUE)
head(sample_datafile)

## [1] 10.405  9.864  4.857 10.318  9.813  7.935

mean(sample_datafile)

## [1] 9.454

max(sample_datafile)

## [1] 16.71

var(sample_datafile)

## [1] 3.956

quantile(sample_datafile,.25)

##   25% 
## 8.108

summary(sample_datafile)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.46    8.11    9.45    9.45   10.80   16.70

hist(sample_datafile)

plot of chunk unnamed-chunk-6

The summary statistics of the sample and of the population are approximately equal. This shows that the random sample is not a biased sample from the population.

mean(datafile$V1)

## [1] 9.451

max(datafile$V1)

## [1] 18.97

var(datafile$V1)

## [1] 4.002

quantile(datafile$V1,.25)

##   25% 
## 8.104

summary(datafile$V1)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -0.112   8.100   9.450   9.450  10.800  19.000

hist(datafile$V1)

plot of chunk unnamed-chunk-7

3 a) Desert house box plot interpretation:

The box plot is right-skewed, with a lot of outliers in the data (the circles are the outliers). It clearly shows that the median (the solid line in the box) is approximately 90 and that quartiles 1 and 3 are approximately 50 and 180. Using the box plot we can also read off the range of the data; here it is approximately 0 to 2700. Therefore, we may say that the data that we are looking at is biased.

Ocean house box plot interpretation:

The box plot is symmetric and there are a few outliers that can be removed. The median is approximately 1600, with Q1 and Q3 being approximately 1400 and 1700. The range of the dataset can also be read off: approximately 800 to 2400.

# Load the two house-price samples (neither file has a header row, so each
# gets a single column named V1), then draw the box plot for the desert
# houses. FALSE replaces the reassignable alias F.
ocean_houses <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb3OceanViewdata.csv",
  header = FALSE
)
desert_houses <- read.csv(
  "/home/archana/ML works_ucsc/HW01pb3Desertdata.csv",
  header = FALSE
)
boxplot(desert_houses, main = "Desert Houses Box Plot")

plot of chunk unnamed-chunk-8

boxplot(ocean_houses,main="Ocean Houses Box Plot")

plot of chunk unnamed-chunk-8

Histogram with default intervals, and histogram with intervals of width 500 (thousand dollars) covering $0 to $3 million

hist(ocean_houses$V1)

plot of chunk unnamed-chunk-9

# Same data as the default histogram, but with explicit bin edges every 500
# from 0 to 3000 (the axis label gives prices in thousands of dollars).
hist(
  ocean_houses$V1,
  breaks = seq(from = 0, to = 3000, by = 500),
  xlab = "Ocean house price in Thousand of dollars",
  main = "Histogram of Ocean houses"
)

plot of chunk unnamed-chunk-9

Plotting the ECDF of desert and ocean houses in a single graph

# Overlay the empirical CDFs of both price samples on one set of axes:
# ocean-view houses in blue (drawn first, sets up the axes) and desert houses
# in black. TRUE replaces the reassignable alias T.
plot(
  ecdf(ocean_houses[, 1]),
  xlim = c(0, 2500),
  verticals = TRUE,
  col = "blue",
  main = "Empirical Cumulative Distribution for Houses",
  xlab = "Price in Thousand Dollars",
  ylab = "ECDF"
)
lines(ecdf(desert_houses[, 1]), verticals = TRUE, col = "black")
# Legend entries are listed in the same order as their colours.
legend(
  1800, 0.8,
  c("Desert houses", "Ocean houses"),
  lty = c(1, 1),
  col = c("black", "blue")
)

plot of chunk unnamed-chunk-10 4 a)

# Take the built-in Orange data as a plain data frame and plot trunk
# circumference against age, colouring each point by its tree.
orange <- as.data.frame(Orange)
plot(
  orange$age,
  orange$circumference,
  main = "Circumference vs. Age for Trees",
  xlab = "Age of Tree",
  ylab = "Trunk Circumference in mm",
  col = orange$Tree,
  pch = 20
)
# Legend labels are the tree levels; label i is written in colour i, which is
# the same integer code that col = orange$Tree uses for that level.
legend(
  "bottomright",
  title = "Tree Type",
  legend = levels(factor(orange$Tree)),
  text.col = seq_along(levels(orange$Tree))
)

plot of chunk unnamed-chunk-11 b)

# Keep only the rows belonging to tree 1 and report the correlation between
# age and circumference for that tree.
orange_1 <- subset(orange, Tree == 1)
with(orange_1, cor(age, circumference))

## [1] 0.9855

# Per-tree covariance and correlation between age and circumference.
#
# Orange$Tree is an ordered factor whose level order is not 1..5, and grouped
# results (by(), split()) come back in level order. The table rows are labelled
# with the sorted tree ids, so the groups are selected by name in that same
# sorted order; otherwise each value would be attached to the wrong tree.
tree_levels <- sort(levels(orange$Tree))
tree_groups <- split(orange, orange$Tree)[tree_levels]

tab_stat <- data.frame(Tree = tree_levels, stringsAsFactors = FALSE)
tab_stat$COVARIANCE <- unname(vapply(
  tree_groups,
  function(x) cov(x$age, x$circumference),
  numeric(1)
))
tab_stat$CORRELATION <- unname(vapply(
  tree_groups,
  function(x) cor(x$age, x$circumference),
  numeric(1)
))
tab_stat

##   Tree COVARIANCE CORRELATION
## 1    1      22240      0.9882
## 2    2      22340      0.9855
## 3    3      30443      0.9877
## 4    4      34290      0.9874
## 5    5      37063      0.9845

Adding a constant (10) to the circumference leaves both the covariance and the correlation unchanged.

# Recompute the per-tree statistics after adding 10 to every circumference.
# Grouped results come back in the factor-level order of orange$Tree, which
# need not match the row order of tab_stat, so each value is looked up by
# tree name before being stored.
cov_by_tree <- vapply(
  split(orange, orange$Tree),
  function(x) cov(x$age, x$circumference + 10),
  numeric(1)
)
cor_by_tree <- vapply(
  split(orange, orange$Tree),
  function(x) cor(x$age, x$circumference + 10),
  numeric(1)
)
tab_stat$COVARIANCE <- unname(cov_by_tree[as.character(tab_stat$Tree)])
tab_stat$CORRELATION <- unname(cor_by_tree[as.character(tab_stat$Tree)])
tab_stat

##   Tree COVARIANCE CORRELATION
## 1    1      22240      0.9882
## 2    2      22340      0.9855
## 3    3      30443      0.9877
## 4    4      34290      0.9874
## 5    5      37063      0.9845

Doubling the circumference doubles the covariance, while the correlation remains the same.

# Recompute the per-tree statistics after doubling every circumference.
# Grouped results come back in the factor-level order of orange$Tree, which
# need not match the row order of tab_stat, so each value is looked up by
# tree name before being stored.
cov_by_tree <- vapply(
  split(orange, orange$Tree),
  function(x) cov(x$age, x$circumference * 2),
  numeric(1)
)
cor_by_tree <- vapply(
  split(orange, orange$Tree),
  function(x) cor(x$age, x$circumference * 2),
  numeric(1)
)
tab_stat$COVARIANCE <- unname(cov_by_tree[as.character(tab_stat$Tree)])
tab_stat$CORRELATION <- unname(cor_by_tree[as.character(tab_stat$Tree)])
tab_stat

##   Tree COVARIANCE CORRELATION
## 1    1      44480      0.9882
## 2    2      44680      0.9855
## 3    3      60886      0.9877
## 4    4      68581      0.9874
## 5    5      74125      0.9845

Multiplying the circumference by -2 makes the new covariance -2 times the old one; the correlation keeps the same magnitude as before but becomes negative.

# Recompute the per-tree statistics after multiplying every circumference
# by -2. Grouped results come back in the factor-level order of orange$Tree,
# which need not match the row order of tab_stat, so each value is looked up
# by tree name before being stored.
cov_by_tree <- vapply(
  split(orange, orange$Tree),
  function(x) cov(x$age, x$circumference * -2),
  numeric(1)
)
cor_by_tree <- vapply(
  split(orange, orange$Tree),
  function(x) cor(x$age, x$circumference * -2),
  numeric(1)
)
tab_stat$COVARIANCE <- unname(cov_by_tree[as.character(tab_stat$Tree)])
tab_stat$CORRELATION <- unname(cor_by_tree[as.character(tab_stat$Tree)])
tab_stat

##   Tree COVARIANCE CORRELATION
## 1    1     -44480     -0.9882
## 2    2     -44680     -0.9855
## 3    3     -60886     -0.9877
## 4    4     -68581     -0.9874
## 5    5     -74125     -0.9845

5 a) The median is less than the mean. b) The data is right-skewed when the median is less than the mean. c) When 10 (thousand dollars) is added to every price, the median also increases by 10k. d) When the prices are doubled, the median doubles too.

str(desert_houses)

## 'data.frame':    5000 obs. of  1 variable:
##  $ V1: int  93 51 89 83 56 27 10 58 48 19 ...

head(desert_houses)

##   V1
## 1 93
## 2 51
## 3 89
## 4 83
## 5 56
## 6 27

summary(desert_houses)

##        V1      
##  Min.   :  10  
##  1st Qu.:  51  
##  Median :  89  
##  Mean   : 144  
##  3rd Qu.: 172  
##  Max.   :2654

hist(desert_houses$V1)

plot of chunk unnamed-chunk-17

median(desert_houses[,1]+10)

## [1] 99

median(desert_houses[,1]*2)

## [1] 178