Project Statement

The data are from an experiment which was designed to look for a relationship between a certain genetic characteristic and handedness.

Each woman also filled in a questionnaire regarding which hand she used for various tasks.

From these questionnaires a measure of hand preference was found for each mother. The scale of this measure goes from 1, indicating someone who always favours their right hand, to 8, indicating someone who always favours their left hand. Between these two extremes are people who favour one hand for some tasks and the other for other tasks.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Read the raw data from GitHub.
# Genetic links to left-handedness (the boot/claridge.csv file in Rdatasets)

theUrl <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/boot/claridge.csv"

GLH <- read.table(file = theUrl, header = TRUE, sep = ",")

# Rename columns.
# An empty header on the row-index column is read in as "X".
# NOTE(review): newer snapshots of the Rdatasets CSVs appear to label that
# column "rownames" instead -- confirm. Both spellings are accepted here so the
# rename cannot silently do nothing.
names(GLH)[names(GLH) == "dnan"] <- "DNA"
names(GLH)[names(GLH) %in% c("X", "rownames")] <- "Subject"

# Fail loudly if the downloaded file does not have the expected layout, rather
# than carrying wrongly named columns into the analysis below.
stopifnot(all(c("Subject", "DNA", "hand") %in% names(GLH)))

# Convert to a tibble so the tidyverse verbs (e.g. mutate) apply directly.

my_data <- as_tibble(GLH)
my_data
## # A tibble: 37 x 3
##    Subject   DNA  hand
##      <int> <int> <int>
##  1       1    13     1
##  2       2    18     1
##  3       3    20     3
##  4       4    21     1
##  5       5    21     1
##  6       6    24     1
##  7       7    24     1
##  8       8    27     1
##  9       9    28     1
## 10      10    28     2
## # ... with 27 more rows
# Count the rows of the table, i.e. the number of patients surveyed.
# Answer: the table has 37 rows, so 37 patients were surveyed.
vec <- nrow(GLH)
vec
## [1] 37
# Add a standardised DNA score (z-score: centred on the mean, scaled by the
# standard deviation) as a new column using mutate().
my_data <- my_data %>%
  mutate(DNA_Stat = (DNA - mean(DNA)) / sd(DNA))

# my_data with the extra column
my_data
## # A tibble: 37 x 4
##    Subject   DNA  hand DNA_Stat
##      <int> <int> <int>    <dbl>
##  1       1    13     1  -2.82  
##  2       2    18     1  -1.91  
##  3       3    20     3  -1.55  
##  4       4    21     1  -1.37  
##  5       5    21     1  -1.37  
##  6       6    24     1  -0.821 
##  7       7    24     1  -0.821 
##  8       8    27     1  -0.275 
##  9       9    28     1  -0.0934
## 10      10    28     2  -0.0934
## # ... with 27 more rows
# Preview the first six rows (six is head()'s default).
head(my_data, n = 6L)
## # A tibble: 6 x 4
##   Subject   DNA  hand DNA_Stat
##     <int> <int> <int>    <dbl>
## 1       1    13     1   -2.82 
## 2       2    18     1   -1.91 
## 3       3    20     3   -1.55 
## 4       4    21     1   -1.37 
## 5       5    21     1   -1.37 
## 6       6    24     1   -0.821
# Per-column summary: min, quartiles, median, mean and max.
my_data %>% summary()
##     Subject        DNA             hand          DNA_Stat       
##  Min.   : 1   Min.   :13.00   Min.   :1.000   Min.   :-2.82032  
##  1st Qu.:10   1st Qu.:28.00   1st Qu.:1.000   1st Qu.:-0.09336  
##  Median :19   Median :29.00   Median :1.000   Median : 0.08844  
##  Mean   :19   Mean   :28.51   Mean   :1.703   Mean   : 0.00000  
##  3rd Qu.:28   3rd Qu.:31.00   3rd Qu.:2.000   3rd Qu.: 0.45204  
##  Max.   :37   Max.   :44.00   Max.   :8.000   Max.   : 2.81541
# Question: Is there skewness or distortion from extremely high or low DNA scores?
# Answer:
# The normalized DNA scores show that there were not many extremely high
# or low scores distorting the DNA readings.
#
#write.csv(my_data, file="left hand genetic study.csv", row.names=FALSE)
#getwd()

Data Exploration

## # A tibble: 6 x 4
##   Subject   DNA  hand DNA_Stat
##     <int> <int> <int>    <dbl>
## 1       1    13     1   -2.82 
## 2       2    18     1   -1.91 
## 3       3    20     3   -1.55 
## 4       4    21     1   -1.37 
## 5       5    21     1   -1.37 
## 6       6    24     1   -0.821
##     Subject        DNA             hand          DNA_Stat       
##  Min.   : 1   Min.   :13.00   Min.   :1.000   Min.   :-2.82032  
##  1st Qu.:10   1st Qu.:28.00   1st Qu.:1.000   1st Qu.:-0.09336  
##  Median :19   Median :29.00   Median :1.000   Median : 0.08844  
##  Mean   :19   Mean   :28.51   Mean   :1.703   Mean   : 0.00000  
##  3rd Qu.:28   3rd Qu.:31.00   3rd Qu.:2.000   3rd Qu.: 0.45204  
##  Max.   :37   Max.   :44.00   Max.   :8.000   Max.   : 2.81541

Correlation between DNA and Handedness

# Pearson correlation between the DNA score and the hand-preference score.
with(my_data, cor(DNA, hand))
## [1] 0.5087758
#
# Hypothesis of the study:
# Ho: larger DNA values are linked to a progressive shift away from
# right-handedness
#
# Answer: The correlation is only slightly above 0.5, which does not indicate a
# high correlation between left-handedness and high DNA scores

Boxplot of data

# One box per column of my_data, each with its own fill colour.
boxplot(
  my_data,
  col = c("red", "sienna", "palevioletred1", "royalblue2")
)

# Question: What kind of distribution do DNA and handedness have?
#
# Answer 1: Like the histogram, the boxplot shows the DNA scores to be
# mostly normally distributed, with a median close to 30 (29) and a few
# outliers at the high and low ends of the scores.

# Answer 2: Also like the histogram, the boxplot shows a strongly right-skewed
# distribution for the handedness scores.
#

Correlation Plot

# Attach with library() so a missing package stops the script immediately;
# require() would only return FALSE and let later calls fail obscurely.
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Attach with library() so a missing package stops the script immediately;
# require() would only return FALSE and let later calls fail obscurely.
library(ggcorrplot)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Compute a correlation matrix over all columns of my_data, rounded to one
# decimal place.
# my_data is an object already in the workspace, not a packaged data set, so
# no data() call is needed (it only produced a "data set not found" warning).
# NOTE(review): the matrix includes the Subject id column; its 0.9 correlation
# with DNA presumably reflects the rows being ordered by DNA -- confirm before
# interpreting it.
corr <- round(cor(my_data), 1)
head(corr[, 1:4])
##          Subject DNA hand DNA_Stat
## Subject      1.0 0.9  0.3      0.9
## DNA          0.9 1.0  0.5      1.0
## hand         0.3 0.5  1.0      0.5
## DNA_Stat     0.9 1.0  0.5      1.0
# Draw the correlation matrix as a plot.
# --------------------------------------
# Square tiles are ggcorrplot's default method; it is spelled out here.
ggcorrplot(corr, method = "square")

# Compute the DNA vs. hand correlation on its own.
# Stored as `dna_hand_cor` rather than `c`, so that the base function c() is
# not shadowed by a number in the workspace.

dna_hand_cor <- cor(my_data$DNA, my_data$hand)

# Print correlation out:
print("correlaton score of DNA vs Handeness:")
## [1] "correlaton score of DNA vs Handeness:"
dna_hand_cor
## [1] 0.5087758
#
# Hypothesis of the study:
# Ho: larger DNA values are linked to a progressive shift away from
# right-handedness
#
# Answer: The correlation is only slightly above 0.5, which does not indicate a
# high correlation between left-handedness and high DNA scores

Conclusion: Objective of Study:

(1) The study set out to show whether a high DNA score in a subject is correlated with a shift away from right-handedness (i.e. towards left-handedness). (2) High DNA scores in patients are linked to defective genes in their sons.

The statistical analysis showed the following observations:

  1. The subjects' DNA scores were (mostly) normally distributed. This suggests that most subjects did not show the abnormally high DNA scores which contribute to mutation of certain genes in their children.
  2. The subjects' hand scores are strongly right-skewed (lower = more right-handed), which suggests that most subjects are right-handed, as is reflected in most population data sets.
  3. The correlation between high DNA scores and left-handedness is about 0.5, which does not suggest a shift toward left-handedness as DNA scores get higher.
  4. The scatter plot also showed that only a handful of high DNA scores go with extreme left-handedness; most of the sample has a DNA score of around 30.