Project 1

Problem 1

# Summary statistics of sepal width, computed separately for each iris species.
by(data = iris$Sepal.Width, INDICES = iris$Species, FUN = summary)

## iris$Species: setosa
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.300   3.200   3.400   3.428   3.675   4.400 
## ------------------------------------------------------------ 
## iris$Species: versicolor
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.525   2.800   2.770   3.000   3.400 
## ------------------------------------------------------------ 
## iris$Species: virginica
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.200   2.800   3.000   2.974   3.175   3.800

Problem 2

# Store the cylinder count as a factor so each distinct count becomes its own box.
mtcars[["cyl2"]] <- factor(mtcars[["cyl"]])

# Side-by-side boxplots of fuel economy, one box per cylinder count.
boxplot(
  mpg ~ cyl2,
  data = mtcars,
  xlab = "Number of Cylinders",
  ylab = "Miles per Gallon"
)

Problem 3

# Central limit theorem demo: draw 1000 samples of size 75 from Uniform(0, 6).
# replicate() returns a 75 x 1000 matrix, so each column holds one sample.
set.seed(1234)
temp <- replicate(1000, runif(75, min = 0, max = 6))

# Compute the 1000 sample means once and reuse them for the plot and the
# summaries instead of recomputing them for every call.
sample_means <- colMeans(temp)
hist(sample_means, main = "Histogram of sample means", xlab = "Sample mean")

mean(sample_means)

## [1] 3.004816

sd(sample_means)

## [1] 0.1993562

These results are consistent with the central limit theorem. If we make a histogram of the data, we can see that our sample means follow an approximately normal distribution with a mean of approximately 3 and a standard deviation of 0.1993562, which is close to the theoretical standard error of $\sqrt{3}/\sqrt{75} = 0.2$. The larger the samples we collect, the closer the distribution of the sample means comes to normal.

Problem 4

# Read the geese data and plot the log of both estimate columns side by side.
geese <- read.csv(
  "C:/Users/Max Billante/Documents/Machine Learning/geese.txt",
  header = TRUE
)

# par() returns the previous settings when it changes them; keep them so the
# 1 x 2 layout does not leak into plots drawn later in the script.
old_par <- par(mfrow = c(1, 2))
hist(log(geese$Aestimate))
hist(log(geese$Bestimate))
par(old_par)

Problem 5

# Load the two tab-separated, header-less cereal sugar tables
# (1979 and 2006, going by the file names) and label their columns.
sugar_cols <- c("Cereal", "SugarContent")
data1 <- read.delim(
  "C:/Users/Max Billante/Documents/Machine Learning/CerealSugar1979(2).txt",
  header = FALSE
)
data2 <- read.delim(
  "C:/Users/Max Billante/Documents/Machine Learning/CerealSugar2006(2).txt",
  header = FALSE
)
names(data1) <- sugar_cols
names(data2) <- sugar_cols

# Two-sample t-test comparing the mean sugar content of the two tables.
t.test(data1$SugarContent, data2$SugarContent)

## 
##  Welch Two Sample t-test
## 
## data:  data1$SugarContent and data2$SugarContent
## t = -0.42044, df = 108.99, p-value = 0.675
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.861317  4.459737
## sample estimates:
## mean of x mean of y 
##  26.76452  27.96531

Seeing that the p-value for this test is 0.675, there is not sufficient evidence that the mean sugar content of cereals changed from 1979 to 2006.

Problem 6

# Fit a linear regression of gross sales on gross cash, cash items and gross check.
retail <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/retail.txt",
  header = TRUE
)

# NOTE(review): the terms are spelled retail$... rather than passed via
# `data = retail`; kept as-is because the recorded summary prints these labels.
model <- lm(
  retail$Gross.Sales ~ retail$Gross.Cash + retail$Cash.Items +
    retail$Gross.Check
)
summary(model)

## 
## Call:
## lm(formula = retail$Gross.Sales ~ retail$Gross.Cash + retail$Cash.Items + 
##     retail$Gross.Check)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -85.96 -29.67  -5.64  29.02  93.42 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -12.92979   22.40870  -0.577   0.5701    
## retail$Gross.Cash    0.08669    0.46887   0.185   0.8551    
## retail$Cash.Items    7.62916    2.86987   2.658   0.0147 *  
## retail$Gross.Check   1.16480    0.13066   8.915 1.39e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.68 on 21 degrees of freedom
## Multiple R-squared:  0.9307, Adjusted R-squared:  0.9208 
## F-statistic: 94.02 on 3 and 21 DF,  p-value: 2.462e-12

According to the summary of the model, holding the other predictors constant, Gross-Sales is expected to increase by 0.08669 units for every one-unit increase in Gross-Cash, by 7.62916 units for every one-unit increase in Cash-Items, and by 1.1648 units for every one-unit increase in Gross-Check. If Gross-Cash, Cash-Items, and Gross-Check were all zero, Gross-Sales is predicted to be -12.92979.

Problem 7

# One-way ANOVA of BMD across the treatment groups in the kudzu data.
kudzu <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/kudzu.txt",
  header = TRUE
)
anova_results <- aov(formula = BMD ~ Treatment, data = kudzu)
summary(anova_results)

##             Df   Sum Sq   Mean Sq F value Pr(>F)   
## Treatment    2 0.003186 0.0015928   7.718 0.0014 **
## Residuals   42 0.008668 0.0002064                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Standard deviation of BMD within each treatment group.
by(data = kudzu$BMD, INDICES = kudzu$Treatment, FUN = sd)

## kudzu$Treatment: Control
## [1] 0.01158735
## ------------------------------------------------------------ 
## kudzu$Treatment: HighDose
## [1] 0.01877105
## ------------------------------------------------------------ 
## kudzu$Treatment: LowDose
## [1] 0.01151066

When we run the ANOVA test, the null hypothesis is that the mean BMD is equal for all three treatment groups, and the alternative hypothesis is that at least one of them is different. The p-value of the ANOVA is 0.0014 (F = 7.718), which is a good indicator that the null hypothesis is false and at least one of the means for BMD is different. Using the by command, we see that the highest standard deviation is no more than two times either of the other standard deviations. This indicates that the ANOVA test is appropriate for this data.

Problem 8

# Count the rows whose VAL column equals 24; rows where VAL is NA are not counted.
getdata <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/getdatacsv.txt",
  header = TRUE,
  sep = "\t"
)
length(which(getdata$VAL == 24))

## [1] 53

Problems 9 & 10

# Load the three tab-separated Mondial tables (city, country, province).
MondialCity <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/mondialcity.txt",
  header = TRUE,
  sep = "\t"
)
MondialCountry <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/mondialcountry.txt",
  header = TRUE,
  sep = "\t"
)
# NOTE(review): "MondailProvince" looks like a misspelling of "MondialProvince";
# the name is kept unchanged in case it is referenced elsewhere.
MondailProvince <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/mondialprovince.txt",
  header = TRUE,
  sep = "\t"
)

# Histogram of the log of population divided by area, one value per country row.
hist(log(MondialCountry$Population / MondialCountry$Area))

# Draw a world map and overlay each city as a solid red dot at its coordinates.
library(maps)
map("world")
points(MondialCity$Longitude, MondialCity$Latitude, pch = 19, col = "Red")