#importing libraries needed for assignment
library(readr)
library(plyr)
library(dplyr)
# Dataset documentation:
# https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/doc/boot/claridge.html
# Load the "Genetic Links to Left-handedness" data (37 rows x 3 columns)
# straight from the Rdatasets GitHub repository.
df <- read_csv(
  file = "https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/claridge.csv"
)
# Show the data frame exactly as it was read in.
df
## # A tibble: 37 x 3
## X1 dnan hand
## <dbl> <dbl> <dbl>
## 1 1 13 1
## 2 2 18 1
## 3 3 20 3
## 4 4 21 1
## 5 5 21 1
## 6 6 24 1
## 7 7 24 1
## 8 8 27 1
## 9 9 28 1
## 10 10 28 2
## # ... with 27 more rows
# Give the columns descriptive names. The first column has a blank header in
# the CSV, and the name read_csv invents for it depends on the readr version
# (`X1` before readr 2.0, `...1` from 2.0 on), so it is renamed by position
# rather than by name.
# `dplyr::` is spelled out because plyr is attached as well and exports its
# own `rename()` with a different interface.
df <- df %>%
  dplyr::rename(
    test_subject = 1,
    DNA_measurement = dnan,
    handedness = hand
  )
# Summary statistics for every column of the renamed data frame.
summary(df)
## test_subject DNA_measurement handedness
## Min. : 1 Min. :13.00 Min. :1.000
## 1st Qu.:10 1st Qu.:28.00 1st Qu.:1.000
## Median :19 Median :29.00 Median :1.000
## Mean :19 Mean :28.51 Mean :1.703
## 3rd Qu.:28 3rd Qu.:31.00 3rd Qu.:2.000
## Max. :37 Max. :44.00 Max. :8.000
Overview of the dataset: The test_subject column simply holds the test subjects' numbers (1-37) and is not meaningful in any way other than to identify which subject belongs with which data. Looking at the DNA_measurement column, the interquartile range is actually quite small, only 3. This tells us that there is not much dispersion around the center of the data. It also looks like there may be some outliers, as the min is 13 and the max is 44, both of which are far from our mean of 28.51. The handedness column shows a score from 1 to 8, 1 being completely right-handed for everything and 8 being completely left-handed for everything, with the middle values being some of both. In general, it looks like the vast majority of subjects are almost completely right-hand dominant, based on the mean, median, and IQR values.
# Mean and median of the DNA_measurement column of the full data frame.
dna_col_mean <- mean(df[["DNA_measurement"]])
dna_col_median <- median(df[["DNA_measurement"]])
sprintf(
  "The mean of the DNA_measurement columns is %f and the median is %f",
  dna_col_mean,
  dna_col_median
)
## [1] "The mean of the DNA_measurement columns is 28.513514 and the median is 29.000000"
# Mean and median of the handedness column of the full data frame.
handedness_col_mean <- mean(df[["handedness"]])
handedness_col_median <- median(df[["handedness"]])
sprintf(
  "The mean of the handedness columns is %f and the median is %f",
  handedness_col_mean,
  handedness_col_median
)
## [1] "The mean of the handedness columns is 1.702703 and the median is 1.000000"
# Build a subset: keep only the two measured columns, restricted to rows whose
# DNA measurement lies strictly between 27 and 32.
new_df <- df %>%
  select(DNA_measurement, handedness) %>%
  filter(DNA_measurement > 27, DNA_measurement < 32)
new_df
## # A tibble: 24 x 2
## DNA_measurement handedness
## <dbl> <dbl>
## 1 28 1
## 2 28 2
## 3 28 1
## 4 28 2
## 5 28 1
## 6 28 4
## 7 28 1
## 8 28 1
## 9 29 1
## 10 29 1
## # ... with 14 more rows
# Rename both columns of the subset; the new handedness name spells out the
# direction of the scale (1 = right, 8 = left).
new_df <- rename(
  new_df,
  measure = DNA_measurement,
  Right_1_Left_8 = handedness
)
new_df
## # A tibble: 24 x 2
## measure Right_1_Left_8
## <dbl> <dbl>
## 1 28 1
## 2 28 2
## 3 28 1
## 4 28 2
## 5 28 1
## 6 28 4
## 7 28 1
## 8 28 1
## 9 29 1
## 10 29 1
## # ... with 14 more rows
Then print the mean and median for the same two attributes. Please compare:
# Summary statistics for the filtered, renamed data frame.
summary(new_df)
## measure Right_1_Left_8
## Min. :28.00 Min. :1.000
## 1st Qu.:28.00 1st Qu.:1.000
## Median :29.00 Median :1.000
## Mean :29.25 Mean :1.375
## 3rd Qu.:30.00 3rd Qu.:2.000
## Max. :31.00 Max. :4.000
# Mean and median of the measure column (DNA_measurement under its new name).
measure_col_mean <- mean(new_df[["measure"]])
measure_col_median <- median(new_df[["measure"]])
sprintf(
  "The mean of the DNA_measurement columns is %f and the median is %f",
  measure_col_mean,
  measure_col_median
)
## [1] "The mean of the DNA_measurement columns is 29.250000 and the median is 29.000000"
# Mean and median of the Right_1_Left_8 column (handedness under its new name).
Right_1_Left_8_col_mean <- mean(new_df[["Right_1_Left_8"]])
Right_1_Left_8_col_median <- median(new_df[["Right_1_Left_8"]])
sprintf(
  "The mean of the handedness columns is %f and the median is %f",
  Right_1_Left_8_col_mean,
  Right_1_Left_8_col_median
)
## [1] "The mean of the handedness columns is 1.375000 and the median is 1.000000"
The mean and median of the DNA_measurement column didn’t change much with the adjustments I made to the data frame. I believe this is because, as I mentioned before, there isn’t much dispersion around the mean of this column, so when I filtered the dataset, I was merely removing some outliers. The handedness column median didn’t change, but the mean did. I believe that in removing the outliers from the DNA_measurement column, I probably removed some of the more strongly left-handed subjects, which brings my average down closer to those who are more predominantly right-handed.
# Overwrite values in the subset.
# Step 1: recode the handedness scores 1 -> 8, 2 -> 7 and 3 -> 6. The
# replacements run in the order written; none of the new codes is matched by a
# later replacement, so no score is recoded twice.
# Step 2: overwrite measure based on the *recoded* score: 15.5 where the score
# is now 8, 43 where it is now 7. This must run after step 1.
new_df <- new_df %>%
  mutate(
    Right_1_Left_8 = replace(Right_1_Left_8, Right_1_Left_8 == 1, 8),
    Right_1_Left_8 = replace(Right_1_Left_8, Right_1_Left_8 == 2, 7),
    Right_1_Left_8 = replace(Right_1_Left_8, Right_1_Left_8 == 3, 6)
  ) %>%
  mutate(
    measure = replace(measure, Right_1_Left_8 == 8, 15.5),
    measure = replace(measure, Right_1_Left_8 == 7, 43)
  )
new_df
## # A tibble: 24 x 2
## measure Right_1_Left_8
## <dbl> <dbl>
## 1 15.5 8
## 2 43 7
## 3 15.5 8
## 4 43 7
## 5 15.5 8
## 6 28 4
## 7 15.5 8
## 8 15.5 8
## 9 15.5 8
## 10 15.5 8
## # ... with 14 more rows
# Lift tibble's row-print limit so that every row is displayed. This is a
# session-wide option, so it also applies to tibbles printed later on.
options(tibble.print_max = Inf)
new_df
## # A tibble: 24 x 2
## measure Right_1_Left_8
## <dbl> <dbl>
## 1 15.5 8
## 2 43 7
## 3 15.5 8
## 4 43 7
## 5 15.5 8
## 6 28 4
## 7 15.5 8
## 8 15.5 8
## 9 15.5 8
## 10 15.5 8
## 11 15.5 8
## 12 43 7
## 13 43 7
## 14 15.5 8
## 15 15.5 8
## 16 15.5 8
## 17 15.5 8
## 18 43 7
## 19 15.5 8
## 20 15.5 8
## 21 15.5 8
## 22 15.5 8
## 23 15.5 8
## 24 43 7
# Read a second copy of the claridge data, this time from the
# R-Bridge-Course GitHub repository, and print it.
my_df <- read_csv(
  file = "https://raw.githubusercontent.com/christianthieme/R-Bridge-Course/master/claridge.csv"
)
my_df
## # A tibble: 37 x 3
## X1 dnan hand
## <dbl> <dbl> <dbl>
## 1 1 13 1
## 2 2 18 1
## 3 3 20 3
## 4 4 21 1
## 5 5 21 1
## 6 6 24 1
## 7 7 24 1
## 8 8 27 1
## 9 9 28 1
## 10 10 28 2
## 11 11 28 1
## 12 12 28 2
## 13 13 28 1
## 14 14 28 4
## 15 15 28 1
## 16 16 28 1
## 17 17 29 1
## 18 18 29 1
## 19 19 29 1
## 20 20 29 2
## 21 21 29 2
## 22 22 29 1
## 23 23 29 1
## 24 24 30 1
## 25 25 30 1
## 26 26 30 2
## 27 27 30 1
## 28 28 31 1
## 29 29 31 1
## 30 30 31 1
## 31 31 31 1
## 32 32 31 2
## 33 33 33 6
## 34 34 33 1
## 35 35 34 1
## 36 36 41 4
## 37 37 44 8