# Read the alien population data. stringsAsFactors = TRUE turns the text
# columns into factors.
aliens <- read.csv("aliens.csv", header = TRUE, stringsAsFactors = TRUE)

# Load the helper script. make.my.sample() is not defined in this file, so it
# presumably comes from here -- TODO confirm.
source("special_functions.R")

# Draw the working sample. The data used to be loaded twice with an
# rm(list = ls()) in between, which wiped the whole workspace (including any
# unrelated objects) only to rebuild the same state; one load is enough.
# NOTE(review): 33002176 and 100 look like a seed/ID and a sample size --
# confirm against make.my.sample().
my_sample <- make.my.sample(33002176, 100, aliens)
## Warning in RNGkind("Mersenne-Twister", "Inversion", "Rounding"): non-uniform
## 'Rounding' sampler used
# Count how often each value of the sample's `college` column occurs. The
# surrounding parentheses print the table in the same step that stores it.
(college.table <- table(my_sample$college))
##
## Callisto Europa Ganymede Io
## 30 25 28 17
After looking at this table, I notice that the data shown is how many aliens, specifically from my sample, attend each of the four colleges.
# Count how often each value of the sample's `politics` column occurs. The
# surrounding parentheses print the table in the same step that stores it.
(politics.table <- table(my_sample$politics))
##
## Democrulite Independone Republicant
## 35 32 33
After looking at this table, I notice that the data shown is how many aliens, specifically from my sample, belong to each of the political parties.
# Count how often each value of the sample's `antennae` column occurs. The
# surrounding parentheses print the table in the same step that stores it.
(antennae.table <- table(my_sample$antennae))
##
## Curly Straight
## 21 79
After looking at this table, I notice that the data shown is how many aliens, specifically from my sample, have either curly or straight antennae.
# Draw one bar chart per frequency table, in the original order:
# college, politics, antennae.
for (counts in list(college.table, politics.table, antennae.table)) {
  barplot(counts)
}
### Question 3
# Bar chart of the college counts.
barplot(college.table)
# Bar chart with one bar per element of my_sample$age.
barplot(my_sample$age)
# Same chart with a legend requested. The argument is spelled out in full
# (`legend` only reached `legend.text` through partial matching) and uses TRUE
# rather than the reassignable shorthand T.
# NOTE(review): per the barplot() documentation, legend.text = TRUE takes its
# labels from the row names of a matrix `height`; presumably nothing is drawn
# for a plain vector like this one -- confirm.
barplot(my_sample$age, legend.text = TRUE)
After adding the legend argument, I saw no difference; however, I
understand it may not have been applied correctly.
I would conclude that the categorical variables I am looking at might be related to each other because college and age are alike.
# Summary statistics and plots for one four-value data set (the same numbers
# as the college counts printed earlier). The vector is assigned once and
# reused by every call below.
x <- c(30, 25, 28, 17)

# Centre: mean and median.
mean(x)
## [1] 25
median(x)
## [1] 26.5

# Spread: sample variance and standard deviation.
var(x)
## [1] 32.66667
sd(x)
## [1] 5.715476

# Distribution plots.
hist(x)
boxplot(x)
## Question 7 Based on the data, there are no evident outliers because
the values are all numerically close to one another.
# Anxiety by island, one box per island, with the y-axis fixed to 30-70.
# Passing the data frame through `data =` keeps the formula short and labels
# the axes with the bare column names instead of "my_sample$...".
boxplot(anxiety ~ island, data = my_sample, ylim = c(30, 70))
After running this code, I see it does not give very specific data
because the limits set for the y-axis are so broad. If I were to
remove ylim, the code would not run effectively. This sample shows
that on Nanspucket the level of anxiety varies more, while on Blick and
Plume the levels seem more consistent with the population.
# Age and income by island for the first sample. `data = my_sample` replaces
# the repeated my_sample$ prefixes, so the axes are labelled with the bare
# column names. Each plot keeps its own fixed y-axis range.
boxplot(age ~ island, data = my_sample, ylim = c(30, 70))
boxplot(income ~ island, data = my_sample, ylim = c(100, 100000))
For this sample, I am comparing the age and income of the aliens. The two variables do not have anything to do with each other; however, it is interesting to see how much they differ in scale, because I chose to expand the ylim for the income box plot in order to see more information.
# Draw a second sample using a different first argument. The RNG warning that
# make.my.sample() raised for the first sample is silenced for this call.
my_sample2 <- suppressWarnings(make.my.sample(330021761, 100, aliens))

# Group the second sample by its OWN island column. The earlier code grouped
# my_sample2's values by my_sample$island, pairing each value with the island
# of a row from the first sample, so the boxes did not describe sample 2.
boxplot(age ~ island, data = my_sample2, ylim = c(30, 70))
boxplot(income ~ island, data = my_sample2, ylim = c(100, 100000))
After running the same diagrams with a different sample, I noticed some
differences. With income, in sample 2, the Blick aliens have a lower
average income, while the opposite occurs for the residents of
Nanspucket. For age, there is not much difference except for a noticeable
difference in the surplus of young aliens who live on Plume in sample
2.
# Elements 2 to 40 of the data set; they are identical in every step below.
shared_values <- c(
  2, 38, 23, 38, 26, 21, 140, 98, 97, 46, 123, 76, 4, 152, 67, 94, 124, 113,
  115, 181, 25, 65, 112, 108, 165, 73, 69, 133, 19, 12, 91, 84, 77, 66, 99,
  111, 11, 38, 196
)

# NOTE(review): the mean is computed with 510 as the first value, but the
# plots below use 10 -- confirm which one is intended and make them agree.
x <- c(510, shared_values)
mean(x)
## [1] 91.05

# Plots use 10 as the first value.
x <- c(10, shared_values)
boxplot(x)
hist(x)
## Question 13
# Forty-value data set for this question, assigned once and reused for the
# mean, the box plot and the histogram.
x <- c(
  80, 81, 88, 83, 89, 66, 82, 82, 87, 85,
  90, 93, 96, 94, 95, 97, 92, 91, 98, 99,
  100, 102, 103, 104, 105, 106, 107, 108, 109, 110,
  112, 111, 113, 114, 115, 116, 117, 118, 119, 120
)
mean(x)
## [1] 99.425
boxplot(x)
hist(x)
## Question 14
# Forty-value data set for this question, assigned once and reused for the
# mean, the box plot and the histogram.
x <- c(
  10, 15, 20, 25, 30, 35, 40, 45, 50, 55,
  60, 65, 70, 75, 80, 81, 85, 87, 121, 122,
  123, 125, 12, 19, 13, 131, 110, 141, 147, 155,
  160, 165, 170, 175, 180, 185, 190, 195, 200, 205
)
mean(x)
## [1] 99.3
boxplot(x)
hist(x)
## Question 15
Based on the prior graphs, it is clear that histograms are better at showing the distributions of data. I think this is because you have a visual of the data in a way that is easy to read. To see the numbers based on bars, you can see the data fluctuate rather than on the boxplot.
# Forty values ending in one extreme value (20005); used to show how the mean
# reacts to it.
x <- c(
  10, 15, 20, 25, 30, 35, 40, 45, 50, 55,
  60, 65, 70, 75, 80, 81, 85, 87, 121, 122,
  123, 125, 12, 19, 13, 131, 110, 141, 147, 155,
  160, 165, 170, 175, 180, 185, 190, 195, 200, 20005
)
mean(x)
## [1] 594.3
When adding an extremely high value, the mean jumps substantially and no longer accurately represents the data.
# Forty values ending in one extreme value (20005); used to show how the
# median reacts to it.
x <- c(
  10, 15, 20, 25, 30, 35, 40, 45, 50, 55,
  60, 65, 70, 75, 80, 81, 85, 87, 121, 122,
  123, 125, 12, 19, 13, 131, 110, 141, 147, 155,
  160, 165, 170, 175, 180, 185, 190, 195, 200, 20005
)
median(x)
## [1] 86
When adding a high value to the data set, the median does not change as much because, apart from this one large value, the rest of the data is consistent.