# Summary statistics (min, quartiles, median, mean, max) of sepal width,
# reported separately for each iris species.
by(data = iris$Sepal.Width, INDICES = iris$Species, FUN = summary)
## iris$Species: setosa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.300 3.200 3.400 3.428 3.675 4.400
## ------------------------------------------------------------
## iris$Species: versicolor
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.525 2.800 2.770 3.000 3.400
## ------------------------------------------------------------
## iris$Species: virginica
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.200 2.800 3.000 2.974 3.175 3.800
# Store the cylinder count as a factor so each distinct value becomes its own
# group, then draw one box of fuel economy per cylinder group.
mtcars$cyl2 <- as.factor(mtcars$cyl)
boxplot(
  mpg ~ cyl2,
  data = mtcars,
  xlab = "Number of Cylinders",
  ylab = "Miles per Gallon"
)
# Central limit theorem simulation: draw 1000 independent samples of size 75
# from Uniform(0, 6). `replicate` returns a 75 x 1000 matrix with one sample
# per column.
set.seed(1234)
temp <- replicate(1000, runif(75, min = 0, max = 6))
# Compute the 1000 sample means once and reuse them for the plot and both
# summary statistics instead of re-running apply() three times.
sample_means <- colMeans(temp)
hist(
  sample_means,
  main = "Sampling distribution of the mean (n = 75)",
  xlab = "Sample mean"
)
mean(sample_means)
## [1] 3.004816
sd(sample_means)
## [1] 0.1993562
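These results are consistent with the central limit theorem. If we make a histogram of the sample means, we can see that they follow an approximately normal distribution with a mean of about 3 and a standard deviation of 0.1993562. This agrees with theory: a Uniform(0, 6) population has mean 3 and standard deviation sqrt(3) (about 1.732), so the mean of 75 observations has standard deviation sqrt(3)/sqrt(75) = 0.2. The larger the sample size, the closer the distribution of the sample mean is to normal.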
# Read the geese data and show histograms of the log-transformed Aestimate
# and Bestimate columns side by side.
geese <- read.csv("C:/Users/Max Billante/Documents/Machine Learning/geese.txt", header = TRUE)
# par() returns the previous settings; keep them so the two-panel layout can
# be undone and does not leak into plots drawn later in the script.
old_par <- par(mfrow = c(1, 2))
hist(log(geese$Aestimate))
hist(log(geese$Bestimate))
par(old_par)
### Problem 5
# Load the two tab-separated, header-less cereal files (1979 and 2006), give
# both the same column names, and compare their sugar content with a Welch
# two-sample t-test.
sugar_columns <- c("Cereal", "SugarContent")
data1 <- read.csv(
  "C:/Users/Max Billante/Documents/Machine Learning/CerealSugar1979(2).txt",
  header = FALSE,
  sep = "\t"
)
data2 <- read.csv(
  "C:/Users/Max Billante/Documents/Machine Learning/CerealSugar2006(2).txt",
  header = FALSE,
  sep = "\t"
)
names(data1) <- sugar_columns
names(data2) <- sugar_columns
t.test(data1$SugarContent, data2$SugarContent)
##
## Welch Two Sample t-test
##
## data: data1$SugarContent and data2$SugarContent
## t = -0.42044, df = 108.99, p-value = 0.675
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.861317 4.459737
## sample estimates:
## mean of x mean of y
## 26.76452 27.96531
Seeing that the p-value for this test is 0.675, there is not sufficient evidence that the mean sugar content of cereals changed from 1979 to 2006.
# Multiple linear regression of Gross.Sales on Gross.Cash, Cash.Items and
# Gross.Check from the retail data set.
retail <- read.table("C:/Users/Max Billante/Documents/Machine Learning/retail.txt", header = TRUE)
# Columns are named in the formula and resolved through `data = retail`
# (not `retail$...`) so that predict(model, newdata = ...) uses the new data
# and the coefficient labels are the plain column names.
model <- lm(Gross.Sales ~ Gross.Cash + Cash.Items + Gross.Check, data = retail)
summary(model)
##
## Call:
## lm(formula = Gross.Sales ~ Gross.Cash + Cash.Items + Gross.Check,
## data = retail)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.96 -29.67 -5.64 29.02 93.42
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.92979 22.40870 -0.577 0.5701
## Gross.Cash 0.08669 0.46887 0.185 0.8551
## Cash.Items 7.62916 2.86987 2.658 0.0147 *
## Gross.Check 1.16480 0.13066 8.915 1.39e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.68 on 21 degrees of freedom
## Multiple R-squared: 0.9307, Adjusted R-squared: 0.9208
## F-statistic: 94.02 on 3 and 21 DF, p-value: 2.462e-12
According to the summary of the model, Gross-Sales is expected to increase by 0.08669 units for every one-unit increase in Gross-Cash, by 7.62916 units for every one-unit increase in Cash-Items, and by 1.1648 units for every one-unit increase in Gross-Check, in each case holding the other two predictors constant. If Gross-Cash, Cash-Items, and Gross-Check were all zero, Gross-Sales would be predicted to be -12.92979.
# One-way ANOVA of BMD across the Treatment groups in the kudzu data.
kudzu <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/kudzu.txt",
  header = TRUE
)
anova_results <- aov(formula = BMD ~ Treatment, data = kudzu)
summary(anova_results)
## Df Sum Sq Mean Sq F value Pr(>F)
## Treatment 2 0.003186 0.0015928 7.718 0.0014 **
## Residuals 42 0.008668 0.0002064
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Standard deviation of BMD within each treatment group, used to compare the
# spread of the groups.
by(data = kudzu$BMD, INDICES = kudzu$Treatment, FUN = sd)
## kudzu$Treatment: Control
## [1] 0.01158735
## ------------------------------------------------------------
## kudzu$Treatment: HighDose
## [1] 0.01877105
## ------------------------------------------------------------
## kudzu$Treatment: LowDose
## [1] 0.01151066
When we run the ANOVA test, the null hypothesis is that the mean BMD is equal for all three treatment groups, and the alternative hypothesis is that at least one of them is different. The F value of the ANOVA is 7.718 with a p-value of 0.0014, which is a good indicator that the null hypothesis is false and at least one of the means for BMD is different. Using the by command, we see that the highest standard deviation is no more than two times either of the other standard deviations. This indicates that the ANOVA test is appropriate for this data.
# Count the rows whose VAL column equals 24; missing values are ignored
# rather than turning the count into NA.
getdata <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/getdatacsv.txt",
  header = TRUE,
  sep = "\t"
)
val_is_24 <- getdata$VAL == 24
sum(val_is_24, na.rm = TRUE)
## [1] 53
# Load the three tab-separated Mondial tables (city, country, province).
MondialCity <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/mondialcity.txt",
  header = TRUE,
  sep = "\t"
)
MondialCountry <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/mondialcountry.txt",
  header = TRUE,
  sep = "\t"
)
# NOTE(review): the object name is spelled "Mondail" here; it is kept as-is
# because code outside this section may refer to it - confirm before renaming.
MondailProvince <- read.table(
  "C:/Users/Max Billante/Documents/Machine Learning/mondialprovince.txt",
  header = TRUE,
  sep = "\t"
)
# Histogram of the log of Population divided by Area for each country.
hist(log(MondialCountry$Population / MondialCountry$Area))
# Draw the world map and overlay every city as a solid red dot at its
# longitude/latitude.
library(maps)
map("world")
points(
  x = MondialCity$Longitude,
  y = MondialCity$Latitude,
  pch = 19,
  col = "Red"
)