# Load the alien population data and the course helper functions once.
# The original loaded both, wiped the workspace with rm(list = ls()), and then
# loaded them again; the wipe and the duplicate load are dropped because they
# only discard and then redo the same work.
aliens <- read.csv("aliens.csv", header = TRUE, stringsAsFactors = TRUE)
source("special_functions.R")

# Build this report's sample. make.my.sample() is presumably defined in
# special_functions.R; its arguments are passed positionally as before
# (NOTE(review): looks like an ID/seed, a sample size, and the population
# data frame -- confirm against the helper's definition).
my_sample <- make.my.sample(33002176, 100, aliens)
## Warning in RNGkind("Mersenne-Twister", "Inversion", "Rounding"): non-uniform
## 'Rounding' sampler used
  # Frequency table of the `college` column in the sample, printed below.
  college.table <- table(my_sample[["college"]])
print(college.table)
## 
## Callisto   Europa Ganymede       Io 
##       30       25       28       17

Question 1

After looking at this table, I notice that the data shown is how many aliens, specifically from my sample, attend each of the four colleges.

# Frequency table of the `politics` column in the sample, printed below.
politics.table <- table(my_sample[["politics"]])
print(politics.table)
## 
## Democrulite Independone Republicant 
##          35          32          33

After looking at this table, I notice that the data shown is how many aliens, specifically from my sample, belong to each of the political parties.

# Frequency table of the `antennae` column in the sample, printed below.
antennae.table <- table(my_sample[["antennae"]])
print(antennae.table)
## 
##    Curly Straight 
##       21       79

After looking at this table, I notice that the data shown is how many aliens, specifically from my sample, have either curly or straight antennae.

Question 2

# Draw one bar chart per frequency table, in order: college, politics,
# antennae. invisible() keeps the bar midpoints returned by barplot() from
# being printed.
invisible(lapply(
  list(college.table, politics.table, antennae.table),
  barplot
))

### Question 3

# Bar chart of the college frequency table.
barplot(height = college.table)

# Bar chart of the raw `age` column; barplot() draws one bar per element of
# the vector it is given.
barplot(height = my_sample[["age"]])

Question 4

# Same bar chart with a legend requested. The argument is spelled out as
# `legend.text` (the original `legend` only worked through partial matching)
# and `TRUE` replaces the reassignable shorthand `T`.
# Per the barplot() documentation, a logical `legend.text` is only useful when
# `height` is a matrix, so no legend appears for this plain vector.
barplot(my_sample$age, legend.text = TRUE)

After adding the input `legend = T`, I saw no difference; however, I understand that it may not have been applied correctly.

Question 5

I would conclude that the categorical variables I am looking at might be related to each other because college and age are alike.

Question 6
# The four values under study, defined once and reused for every summary
# statistic and plot below.
x <- c(30, 25, 28, 17)

# Centre: mean and median.
mean(x)
## [1] 25
median(x)
## [1] 26.5

# Spread: sample variance and standard deviation.
var(x)
## [1] 32.66667
sd(x)
## [1] 5.715476

# Distribution plots.
hist(x)

boxplot(x)

## Question 7

Based on the data, there are no evident outliers because the values are consistently close to each other numerically.

# Side-by-side boxplots of anxiety for each island in the sample.
# The formula's variables are looked up in `my_sample` through `data =`
# instead of repeating `my_sample$` on both sides, so any default axis labels
# use the bare column names. `ylim` fixes the y-axis range to 30-70.
boxplot(anxiety ~ island, data = my_sample, ylim = c(30, 70))

After running this code, I see that it does not give very specific data because the numbers that control the y-axis are so vague. If I were to remove the `ylim` argument, the code would not run effectively. This sample shows that on Nanspucket the level of anxiety varies more, while on Blick and Plume the levels seem more consistent with the population.

Question 10

# Age by island, with the y-axis fixed to 30-70. Columns are resolved through
# `data = my_sample` rather than repeating `my_sample$` in the formula.
boxplot(age ~ island, data = my_sample, ylim = c(30, 70))

# Income by island, with a much wider y-axis (100 to 100000).
boxplot(income ~ island, data = my_sample, ylim = c(100, 100000))

For this sample I am comparing the age and income of the aliens. The two variables do not have anything to do with each other; however, it is interesting to see how much they differ in scale, because I chose to expand the `ylim` for the income boxplot in order to see more information.

Question 11

# Draw a second sample for comparison. suppressWarnings() is limited to this
# one call, which otherwise emits the RNGkind "Rounding" sampler warning.
my_sample2 <- suppressWarnings(make.my.sample(330021761, 100, aliens))

# Fix: the original grouped my_sample2's values by my_sample$island, pairing
# each alien in the second sample with the island of an unrelated alien from
# the first. Both the response and the grouping column now come from
# my_sample2 via `data =`.
boxplot(age ~ island, data = my_sample2, ylim = c(30, 70))

boxplot(income ~ island, data = my_sample2, ylim = c(100, 100000))

After running the same diagrams with a different sample, I noticed some differences. With income, in sample 2, the Blick aliens have a lower average income, while the opposite occurs for the residents of Nanspucket. For age, there is not much difference except for a noticeable difference in the surplus of young aliens who live on Plume in sample 2.

Question 12

# One definition of the data set, shared by the mean and both plots.
# Fix: the original computed the mean on a vector starting with 510 but drew
# the boxplot and histogram from a second copy starting with 10, so the
# statistic and the plots described different data.
# NOTE(review): 510 is kept because it reproduces the recorded mean of 91.05;
# confirm that 510, not 10, is the intended first value.
x <- c(
  510, 2, 38, 23, 38, 26, 21, 140, 98, 97,
  46, 123, 76, 4, 152, 67, 94, 124, 113, 115,
  181, 25, 65, 112, 108, 165, 73, 69, 133, 19,
  12, 91, 84, 77, 66, 99, 111, 11, 38, 196
)
mean(x)
## [1] 91.05
boxplot(x)

hist(x)

## Question 13

# Data set for this question, defined once and reused for the mean, the
# boxplot, and the histogram.
x <- c(
  80, 81, 88, 83, 89, 66, 82, 82, 87, 85,
  90, 93, 96, 94, 95, 97, 92, 91, 98, 99,
  100, 102, 103, 104, 105, 106, 107, 108, 109, 110,
  112, 111, 113, 114, 115, 116, 117, 118, 119, 120
)
mean(x)
## [1] 99.425
boxplot(x)

hist(x)

## Question 14

# Data set for this question, defined once and reused for the mean, the
# boxplot, and the histogram.
x <- c(
  10, 15, 20, 25, 30, 35, 40, 45, 50, 55,
  60, 65, 70, 75, 80, 81, 85, 87, 121, 122,
  123, 125, 12, 19, 13, 131, 110, 141, 147, 155,
  160, 165, 170, 175, 180, 185, 190, 195, 200, 205
)
mean(x)
## [1] 99.3
boxplot(x)

hist(x)

## Question 15

Based on the prior graphs, it is clear that histograms are better at showing the distributions of data. I think this is because you have a visual of the data in a way that is easy to read. To see the numbers based on bars, you can see the data fluctuate rather than on the boxplot.

Question 16

# Same values as the previous data set, except that the final value is 20005
# instead of 205; the mean is then recomputed.
x <- c(
  10, 15, 20, 25, 30, 35, 40, 45, 50, 55,
  60, 65, 70, 75, 80, 81, 85, 87, 121, 122,
  123, 125, 12, 19, 13, 131, 110, 141, 147, 155,
  160, 165, 170, 175, 180, 185, 190, 195, 200, 20005
)
mean(x)
## [1] 594.3

When an extremely high value is added, the mean jumps dramatically and no longer accurately represents the data.

# The data set containing the extreme value 20005, restated so this step
# stands alone; the median is then computed.
x <- c(
  10, 15, 20, 25, 30, 35, 40, 45, 50, 55,
  60, 65, 70, 75, 80, 81, 85, 87, 121, 122,
  123, 125, 12, 19, 13, 131, 110, 141, 147, 155,
  160, 165, 170, 175, 180, 185, 190, 195, 200, 20005
)
median(x)
## [1] 86

When adding a high value to the data set, the median does not change as much because besides this one large value, the rest of the data seems to be consistent.