by(iris$Sepal.Width, iris$Species, summary)
## iris$Species: setosa
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.300   3.200   3.400   3.428   3.675   4.400 
## --------------------------------------------------------
## iris$Species: versicolor
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.525   2.800   2.770   3.000   3.400 
## --------------------------------------------------------
## iris$Species: virginica
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.200   2.800   3.000   2.974   3.175   3.800
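For reference, tapply() produces the same per-group summaries as a plain list (an equivalent alternative, not used in the original):

tapply(iris$Sepal.Width, iris$Species, summary)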
# Treat cylinder count as a categorical variable so the boxplot groups by it
cyl2 <- factor(mtcars$cyl)
boxplot(mpg ~ cyl2, data = mtcars, main = "Car MPG", xlab = "Number of Cylinders", ylab = "MPG")
set.seed(101)
# Draw 1000 independent samples of size 100 from N(mean = 10, sd = 5)
rSamples <- replicate(1000, rnorm(100, 10, 5), simplify = FALSE)
# Compute the mean of each sample
rSamples.means <- sapply(rSamples, mean)
mean(rSamples.means)
## [1] 10.0144
sd(rSamples.means)
## [1] 0.4776565
The results are consistent with the Central Limit Theorem: the mean of the 1000 sample means is right around 10 (the population mean), and their standard deviation is about 0.5, which matches the theoretical standard error \(\sigma/\sqrt{n} = 5/\sqrt{100} = 0.5\). The distribution of the sample means is therefore approximately normal.
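As a quick check (a sketch, not part of the original output), the observed standard deviation can be compared against the theoretical standard error, and the sample means can be plotted to confirm they are roughly bell-shaped:

5/sqrt(100)  # theoretical standard error of the mean: sigma / sqrt(n) = 0.5
hist(rSamples.means, main = "Sampling Distribution of the Mean", xlab = "Sample Mean")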
geese <- read.csv(file="/Users/Alex/Dropbox/College/4-Senior/Machine Learning/Project1/geese.txt", header=TRUE, sep=",")
par(mfrow=c(1,2))
hist(geese$Aestimate, main="A Estimate")
hist(geese$Bestimate, main="B Estimate")
cs1979 <- read.csv(file="/Users/Alex/Dropbox/College/4-Senior/Machine Learning/Project1/CerealSugar1979.txt", header=FALSE, sep="\t")
names(cs1979) <- c("Cereal", "SugarContent")
cs2006 <- read.csv(file="/Users/Alex/Dropbox/College/4-Senior/Machine Learning/Project1/CerealSugar2006.txt", header=FALSE, sep="\t")
names(cs2006) <- c("Cereal", "SugarContent")
t.test(cs1979$SugarContent, cs2006$SugarContent)
##
## Welch Two Sample t-test
##
## data: cs1979$SugarContent and cs2006$SugarContent
## t = -0.42044, df = 108.99, p-value = 0.675
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.861317 4.459737
## sample estimates:
## mean of x mean of y 
##  26.76452  27.96531
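Since the p-value (0.675) is far above 0.05 and the confidence interval for the difference contains zero, we fail to reject the null hypothesis: there is no evidence that mean sugar content changed between 1979 and 2006. If the p-value is needed programmatically (a minor convenience, not in the original analysis), it can be pulled from the test object:

t.test(cs1979$SugarContent, cs2006$SugarContent)$p.value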
retail = read.table(file="/Users/Alex/Dropbox/College/4-Senior/Machine Learning/Project1/retail.txt", header=TRUE, sep="\t")
# Linear model: predict gross sales from cash sales, cash item count, and check sales
grossSalesPredictor <- lm(Gross.Sales ~ Gross.Cash + Cash.Items + Gross.Check, data = retail)
grossSalesPredictor
##
## Call:
## lm(formula = Gross.Sales ~ Gross.Cash + Cash.Items + Gross.Check,
## data = retail)
##
## Coefficients:
## (Intercept)   Gross.Cash   Cash.Items  Gross.Check  
##   -12.92979      0.08669      7.62916      1.16480
From this linear model, the following prediction equation can be written: \[\widehat{\text{Gross Sales}} = -12.93 + 0.087(\text{Gross Cash}) + 7.629(\text{Cash Items}) + 1.165(\text{Gross Check})\]
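As a sketch of how the fitted model could be used for prediction (the input values below are hypothetical, chosen purely for illustration):

# Hypothetical day of sales; predict() applies the fitted coefficients
newDay <- data.frame(Gross.Cash = 500, Cash.Items = 40, Gross.Check = 200)
predict(grossSalesPredictor, newdata = newDay)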
kudzu = read.table(file="/Users/Alex/Dropbox/College/4-Senior/Machine Learning/Project1/kudzu.txt", header=TRUE, sep="\t")
by(kudzu$BMD, kudzu$Treatment, sd)
## kudzu$Treatment: Control
## [1] 0.01158735
## --------------------------------------------------------
## kudzu$Treatment: HighDose
## [1] 0.01877105
## --------------------------------------------------------
## kudzu$Treatment: LowDose
## [1] 0.01151066
kudzuModel <- lm(BMD ~ Treatment, data=kudzu)
anova(kudzuModel)
## Analysis of Variance Table
##
## Response: BMD
##            Df    Sum Sq    Mean Sq F value   Pr(>F)   
## Treatment   2 0.0031856 0.00159282  7.7182 0.001397 **
## Residuals  42 0.0086676 0.00020637
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
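The p-value of about 0.0014 indicates that mean BMD differs across the kudzu treatments. A common follow-up (not part of the original analysis) is a pairwise comparison with Tukey's HSD on an aov() fit:

# Pairwise treatment comparisons with family-wise error control
TukeyHSD(aov(BMD ~ Treatment, data = kudzu))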
ss06hid = read.csv(file="/Users/Alex/Dropbox/College/4-Senior/Machine Learning/Project1/ss06hid.csv", header=TRUE, sep=",")
length(which(ss06hid$VAL==24))
## [1] 53
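Equivalently (a sketch): summing a logical vector counts its TRUE values, with na.rm guarding against missing entries.

sum(ss06hid$VAL == 24, na.rm = TRUE)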
country = read.csv(file="/Users/Alex/Dropbox/College/4-Senior/Machine Learning/Project1/Country.txt", header=TRUE, sep=",")
# Population density (people per unit area), on the log scale to reduce skew
popDensity <- log(country$Population/country$Area)
hist(popDensity, main="Histogram of Logarithm of Population Density", xlab="Log of Population Density")
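Population density across countries is typically heavily right-skewed, which is why the logarithm is taken; a side-by-side comparison (illustrative, not part of the original figure set) makes this visible:

par(mfrow = c(1, 2))
hist(country$Population/country$Area, main = "Raw Density", xlab = "Population Density")
hist(popDensity, main = "Log Density", xlab = "Log of Population Density")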
library(maps)
city = read.csv(file="/Users/Alex/Dropbox/College/4-Senior/Machine Learning/Project1/City.csv", header=TRUE, sep=",")
map()  # Draw the world map outline
points(city$Longitude, city$Latitude, col = "red", pch = 19, cex = 0.5)  # Overlay city locations