Loaded the required packages (requirement #10).
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(outliers)
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
This report pre-processes the data so that it can later be used to test whether there is a statistically significant relationship between Life Expectancy and different Mental Health Disorders. The data are reported for each country by Gender. The datasets were read using the read_csv() function, and each one was inspected using the summary() function. The data were untidy, because the column names were values rather than variables, so they were tidied and the required data type conversions were applied. All five datasets were merged using left_join(). The merged data were checked for missing values. Some missing values were caused by an obvious error: the same country was named differently in different datasets. These country names were recoded and the data were merged again. The remaining missing values made up less than 5% of the data, so those tuples were dropped. The data were then checked for outliers. Outliers were found, but they were not bad outliers: they are actual measurements, since some countries may have extreme levels of mental health disorders. So, I did not remove the outliers. Finally, the data were transformed towards a normal distribution by applying the appropriate transformation functions.
In this analysis we will be using 5 datasets on various mental health disorders and on life expectancy by sex. The Life expectancy dataset was collected by the World Health Organization and was downloaded from this link: https://apps.who.int/gho/data/node.main.688?lang=en. It contains information on Life Expectancy at Birth, Life Expectancy at age 60, Healthy Life Expectancy at Birth and Healthy Life Expectancy at age 60 for 182 countries by gender from the year 2000 to 2016. The dataset has 3111 tuples and 14 attributes: Country - There are 182 Countries; Year - Year from 2000 to 2016; life1[3:5] -> Life Expectancy at Birth for Both sexes, Male, Female respectively; life1[6:8] -> Life Expectancy at age 60 for Both sexes, Male, Female respectively; life1[9:11] -> Healthy Life Expectancy at Birth for Both sexes, Male, Female respectively; life1[12:14] -> Healthy Life Expectancy at age 60 for Both sexes, Male, Female respectively.
The Anxiety Disorder, Depression, Bipolar Disorder and Schizophrenia datasets were all collected by Hannah Ritchie and Max Roser from the “Our World in Data” organization. These datasets were downloaded from this link: https://ourworldindata.org/mental-health. They contain information on the prevalence of different mental disorders for 188 countries by gender from the year 1880 to 2016 (but data is only available from 1990 to 2016). Each of these mental disorder datasets has 47808 tuples and 6 attributes: Entity - There are 188 Countries; Code - Code for each Country; Year - Year from 1990 to 2016; Share of males (%) - Prevalence among males of the mental disorder covered by that dataset; Share of females (%) - Prevalence among females of the mental disorder covered by that dataset; Population - Population of each country. In this analysis we will look at the years 2014-2016.
# Directory holding the downloaded CSV files. It defaults to the original
# hard-coded location; set the MENTAL_HEALTH_DATA_DIR environment variable to
# run the report on another machine without editing the script.
data_dir <- Sys.getenv("MENTAL_HEALTH_DATA_DIR", unset = "C:/Users/USER/Desktop")
# The read_csv() calls in this report use bare file names, so the working
# directory still has to point at the data directory.
setwd(data_dir)
# WHO life-expectancy export. skip = 1 ignores the first line of the file, so
# the second line supplies the column names.
# NOTE(review): readr reports duplicated header names and 108 parsing failures
# for this file (see the warnings printed below); inspect them with
# problems(life1) before trusting the affected cells.
life1 <- read_csv("WHOSIS_000001,WHOSIS_000015,WHOSIS_000002,WHOSIS_000007.csv", skip = 1)
## Warning: Duplicated column names deduplicated: 'Both sexes' => 'Both
## sexes_1' [6], 'Male' => 'Male_1' [7], 'Female' => 'Female_1' [8], 'Both sexes'
## => 'Both sexes_2' [9], 'Male' => 'Male_2' [10], 'Female' => 'Female_2' [11],
## 'Both sexes' => 'Both sexes_3' [12], 'Male' => 'Male_3' [13], 'Female' =>
## 'Female_3' [14]
## Parsed with column specification:
## cols(
## Country = col_character(),
## Year = col_double(),
## `Both sexes` = col_double(),
## Male = col_double(),
## Female = col_double(),
## `Both sexes_1` = col_double(),
## Male_1 = col_double(),
## Female_1 = col_double(),
## `Both sexes_2` = col_double(),
## Male_2 = col_double(),
## Female_2 = col_double(),
## `Both sexes_3` = col_double(),
## Male_3 = col_double(),
## Female_3 = col_double()
## )
## Warning: 108 parsing failures.
## row col expected actual file
## 2314 Both sexes no trailing characters 64.8 'WHOSIS_000001,WHOSIS_000015,WHOSIS_000002,WHOSIS_000007.csv'
## 2314 Male no trailing characters 65.7 'WHOSIS_000001,WHOSIS_000015,WHOSIS_000002,WHOSIS_000007.csv'
## 2314 Female no trailing characters 67.4 'WHOSIS_000001,WHOSIS_000015,WHOSIS_000002,WHOSIS_000007.csv'
## 2314 Both sexes_1 no trailing characters 18.4 'WHOSIS_000001,WHOSIS_000015,WHOSIS_000002,WHOSIS_000007.csv'
## 2314 Male_1 no trailing characters 16.4 'WHOSIS_000001,WHOSIS_000015,WHOSIS_000002,WHOSIS_000007.csv'
## .... ............ ...................... ...... .............................................................
## See problems(...) for more details.
# Preview the first rows of the life-expectancy table.
life1 %>% head()
# Anxiety-disorder prevalence, males vs females.
anxiety <- read_csv(file = "prevalence-of-anxiety-disorders-males-vs-females.csv")
## Parsed with column specification:
## cols(
## Entity = col_character(),
## Code = col_character(),
## Year = col_double(),
## `Share of males (%)` = col_double(),
## `Share of females (%)` = col_double(),
## `Total population (Gapminder)` = col_double()
## )
# Preview the anxiety table (author's note: no data available from 1880 to 1889).
anxiety %>% head()
# Depression prevalence, males vs females.
dep <- read_csv(file = "prevalence-of-depression-males-vs-females.csv")
## Parsed with column specification:
## cols(
## Entity = col_character(),
## Code = col_character(),
## Year = col_character(),
## `Prevalence in males (%)` = col_double(),
## `Prevalence in females (%)` = col_double(),
## Population = col_double()
## )
# Preview the depression table (author's note: no data available from 1880 to 1889).
dep %>% head()
# Bipolar-disorder prevalence, males vs females.
bipolar <- read_csv(file = "prevalence-of-bipolar-disorder-in-males-vs-females.csv")
## Parsed with column specification:
## cols(
## Entity = col_character(),
## Code = col_character(),
## Year = col_character(),
## `Share of males (%)` = col_double(),
## `Share of females (%)` = col_double(),
## Population = col_double()
## )
# Preview the bipolar table (author's note: no data available from 1880 to 1889).
bipolar %>% head()
# Schizophrenia prevalence, males vs females.
schizo <- read_csv(file = "prevalence-of-schizophrenia-in-males-vs-females.csv")
## Parsed with column specification:
## cols(
## Entity = col_character(),
## Code = col_character(),
## Year = col_character(),
## `Share of males (%)` = col_double(),
## `Share of females (%)` = col_double(),
## Population = col_double()
## )
# Preview the schizophrenia table (author's note: no data available from 1880 to 1889).
schizo %>% head()
# Keep only the study years 2014-2016. Year was parsed as character in this
# table, so the comparison values are strings.
dep <- filter(dep, Year %in% as.character(2014:2016))
dep %>% summary()
## Entity Code Year
## Length:825 Length:825 Length:825
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Prevalence in males (%) Prevalence in females (%) Population
## Min. :1.645 Min. :2.348 Min. :1.000e+03
## 1st Qu.:2.391 1st Qu.:3.500 1st Qu.:4.162e+05
## Median :2.778 Median :4.115 Median :5.578e+06
## Mean :2.791 Mean :4.094 Mean :9.148e+07
## 3rd Qu.:3.103 3rd Qu.:4.549 3rd Qu.:2.359e+07
## Max. :4.771 Max. :8.057 Max. :7.464e+09
## NA's :132 NA's :132 NA's :99
# Keep only the study years 2014-2016, then summarise every column.
anxiety <- filter(anxiety, Year %in% as.character(2014:2016))
anxiety %>% summary()
## Entity Code Year Share of males (%)
## Length:693 Length:693 Min. :2014 Min. :1.438
## Class :character Class :character 1st Qu.:2014 1st Qu.:2.399
## Mode :character Mode :character Median :2015 Median :2.582
## Mean :2015 Mean :2.959
## 3rd Qu.:2016 3rd Qu.:3.462
## Max. :2016 Max. :6.073
##
## Share of females (%) Total population (Gapminder)
## Min. : 2.609 Min. : NA
## 1st Qu.: 3.884 1st Qu.: NA
## Median : 4.527 Median : NA
## Mean : 5.030 Mean :NaN
## 3rd Qu.: 5.944 3rd Qu.: NA
## Max. :10.828 Max. : NA
## NA's :693
# Keep only the study years 2014-2016, then summarise every column.
bipolar <- filter(bipolar, Year %in% as.character(2014:2016))
bipolar %>% summary()
## Entity Code Year Share of males (%)
## Length:825 Length:825 Length:825 Min. :0.3303
## Class :character Class :character Class :character 1st Qu.:0.5995
## Mode :character Mode :character Mode :character Median :0.6494
## Mean :0.6851
## 3rd Qu.:0.7997
## Max. :1.1048
## NA's :132
## Share of females (%) Population
## Min. :0.3125 Min. :1.000e+03
## 1st Qu.:0.6382 1st Qu.:4.162e+05
## Median :0.7291 Median :5.578e+06
## Mean :0.7582 Mean :9.148e+07
## 3rd Qu.:0.8866 3rd Qu.:2.359e+07
## Max. :1.2952 Max. :7.464e+09
## NA's :132 NA's :99
# Keep only the study years 2014-2016, then summarise every column.
schizo <- filter(schizo, Year %in% as.character(2014:2016))
schizo %>% summary()
## Entity Code Year Share of males (%)
## Length:825 Length:825 Length:825 Min. :0.1487
## Class :character Class :character Class :character 1st Qu.:0.1873
## Mode :character Mode :character Mode :character Median :0.2111
## Mean :0.2208
## 3rd Qu.:0.2519
## Max. :0.3846
## NA's :132
## Share of females (%) Population
## Min. :0.1486 Min. :1.000e+03
## 1st Qu.:0.1803 1st Qu.:4.162e+05
## Median :0.1980 Median :5.578e+06
## Mean :0.2078 Mean :9.148e+07
## 3rd Qu.:0.2275 3rd Qu.:2.359e+07
## Max. :0.3610 Max. :7.464e+09
## NA's :132 NA's :99
# Keep only the study years 2014-2016, then summarise every column.
life1 <- filter(life1, Year %in% as.character(2014:2016))
life1 %>% summary()
## Country Year Both sexes Male
## Length:549 Min. :2014 Min. :49.60 Min. :48.90
## Class :character 1st Qu.:2014 1st Qu.:65.65 1st Qu.:63.60
## Mode :character Median :2015 Median :73.20 Median :70.00
## Mean :2015 Mean :71.51 Mean :69.15
## 3rd Qu.:2016 3rd Qu.:76.75 3rd Qu.:74.40
## Max. :2016 Max. :84.20 Max. :81.20
## NA's :2 NA's :2
## Female Both sexes_1 Male_1 Female_1
## Min. :50.20 Min. :12.80 Min. :12.70 Min. :12.80
## 1st Qu.:67.45 1st Qu.:17.20 1st Qu.:15.90 1st Qu.:18.10
## Median :76.10 Median :19.40 Median :17.60 Median :20.90
## Mean :73.90 Mean :19.67 Mean :18.24 Mean :20.95
## 3rd Qu.:79.40 3rd Qu.:21.80 3rd Qu.:20.30 3rd Qu.:23.50
## Max. :87.10 Max. :26.40 Max. :24.30 Max. :28.90
## NA's :2 NA's :2 NA's :2 NA's :2
## Both sexes_2 Male_2 Female_2 Both sexes_3
## Min. :44.60 Min. :43.90 Min. :45.30 Min. :10.10
## 1st Qu.:57.70 1st Qu.:56.30 1st Qu.:59.20 1st Qu.:13.00
## Median :64.90 Median :62.40 Median :66.40 Median :14.70
## Mean :63.08 Mean :61.44 Mean :64.71 Mean :15.08
## 3rd Qu.:67.90 3rd Qu.:65.80 3rd Qu.:69.70 3rd Qu.:16.90
## Max. :76.20 Max. :74.70 Max. :77.60 Max. :21.00
## NA's :184 NA's :184 NA's :184 NA's :184
## Male_3 Female_3
## Min. :10.00 Min. :10.20
## 1st Qu.:12.20 1st Qu.:13.60
## Median :13.50 Median :15.90
## Mean :14.04 Mean :16.02
## 3rd Qu.:15.60 3rd Qu.:18.10
## Max. :19.60 Max. :22.90
## NA's :184 NA's :184
#REQUIREMENT 3, 4 & 5 * The datasets (Life Expectancy, Anxiety, Depression, Bipolar Disorder and Schizophrenia) are untidy: the column names (Female, Male) are values, and they shouldn’t be variable names. We need to tidy these datasets before applying the appropriate data type conversions. * Tidying the depression dataset - #REQUIREMENT 5 1. We first rename the columns Entity -> Country, Prevalence in males (%) -> Male_DepressionPrevalence(%), Prevalence in females (%) -> Female_DepressionPrevalence(%) using the names() function. 2. Then we gather the two columns Male_DepressionPrevalence(%) and Female_DepressionPrevalence(%) into one column named Gender1 using the gather() function. 3. We now separate Gender1 into 2 columns, Gender and ALL, using the separate() function with the argument sep = "_". 4. And finally, we spread the column ALL using the spread() function. We now have Gender (Female, Male) and DepressionPrevalence(%) for each gender. * We repeat steps 1 to 4 for the Anxiety, Bipolar Disorder and Schizophrenia datasets. We apply the same tidying steps to the life1 dataset as well, for the columns life1[3:14], and save the result in the “life” data frame. * We filter out “Both” from Gender in the life dataset, as we aim to analyse females and males. The Gender variable was a character variable in all the datasets; it was changed to a categorical variable using the factor() function. #REQUIREMENT 3 & 4. * The Year variable was parsed as a character variable in the depression, bipolar disorder and schizophrenia datasets; it was changed to numeric using the as.numeric() function. #REQUIREMENT 3.
# Reshape one "males vs females" prevalence table into tidy form: one row per
# Country/Year/Gender with a single prevalence column.
#
# df:    table whose column 1 holds the country name, column 4 the male value
#        and column 5 the female value (the layout shared by the four
#        disorder files read above).
# label: disorder name; the value column is named "<label>Prevalence(%)".
# Returns a table with the columns Country, Year, Gender and
# "<label>Prevalence(%)", in that order.
tidy_prevalence <- function(df, label) {
  value_col <- paste0(label, "Prevalence(%)")
  names(df)[1] <- "Country"
  names(df)[4] <- paste0("Male_", value_col)
  names(df)[5] <- paste0("Female_", value_col)
  df <- df %>%
    gather(Gender1, Values, 4:5) %>%
    # "Male_<value_col>" -> Gender = "Male", ALL = "<value_col>"
    separate(Gender1, into = c("Gender", "ALL"), sep = "_") %>%
    spread(ALL, Values)
  # Explicit levels: positional `labels =` would silently relabel the values
  # by sort order if a gender were missing or spelled differently.
  df$Gender <- factor(df$Gender, levels = c("Female", "Male"))
  df$Year <- as.numeric(df$Year)
  # Select by name instead of dropping positions 2 and 4 (country code and
  # population), so a change in column order cannot drop the wrong columns.
  df[c("Country", "Year", "Gender", value_col)]
}

dep <- tidy_prevalence(dep, "Depression")
head(dep)
anxiety <- tidy_prevalence(anxiety, "Anxiety")
head(anxiety)
bipolar <- tidy_prevalence(bipolar, "bipolar")
head(bipolar)
schizo <- tidy_prevalence(schizo, "Schizophrenia")
head(schizo)
# Rename the twelve measure columns of life1 to "<Gender>_<Measure>".
# outer() varies the gender fastest, which matches the column order of the
# file: Both, Male, Female for each of the four measures in turn.
names(life1)[3:14] <- as.vector(outer(
  c("Both", "Male", "Female"),
  c("LifeExpBirth", "LifeExpSixty", "HALEBirth", "HALESixty"),
  paste,
  sep = "_"
))
# One row per Country/Year/Gender and one column per measure. The two HALE
# measures are dropped by name (they were columns 4:5 after spread()), and the
# "Both" rows are removed because the analysis compares females and males.
life <- life1 %>%
  gather(Gender1, Values, 3:14) %>%
  separate(Gender1, into = c("Gender", "ALL"), sep = "_") %>%
  spread(ALL, Values) %>%
  dplyr::select(-HALEBirth, -HALESixty) %>%
  filter(Gender != "Both")
# Build the factor after filtering so that it has exactly the two levels used
# by the disorder tables; previously an unused "Both" level was carried into
# the merged data.
life$Gender <- factor(life$Gender, levels = c("Female", "Male"))
head(life)
# Country, one factor variable -> Gender, and the remaining are numerical
# variables. #REQUIREMENT 2 & 4. The merged data variables are Country, Year,
# Gender, LifeExpBirth, LifeExpSixty, DepressionPrevalence(%),
# AnxietyPrevalence(%), bipolarPrevalence(%), SchizophreniaPrevalence(%).
# Left joins keep every life-expectancy row; a disorder value is NA where a
# table has no matching Country/Year/Gender.
MentalHealth <- life %>%
  left_join(dep, by = c("Country", "Year", "Gender")) %>%
  left_join(anxiety, by = c("Country", "Year", "Gender")) %>%
  left_join(bipolar, by = c("Country", "Year", "Gender")) %>%
  left_join(schizo, by = c("Country", "Year", "Gender"))
dim(MentalHealth)
## [1] 1098 9
head(MentalHealth)
str(MentalHealth)
## tibble [1,098 x 9] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:1098] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : num [1:1098] 2014 2014 2015 2015 2016 ...
## $ Gender : Factor w/ 3 levels "Both","Female",..: 2 3 2 3 2 3 2 3 2 3 ...
## $ LifeExpBirth : num [1:1098] 64.4 61.7 64.7 61.8 64.5 61 78.5 74 78.2 74.2 ...
## $ LifeExpSixty : num [1:1098] 17 15.4 17.1 15.5 17.1 15.5 22.2 19.3 21.8 19.1 ...
## $ DepressionPrevalence(%) : num [1:1098] 4.69 3.57 4.69 3.57 4.69 ...
## $ AnxietyPrevalence(%) : num [1:1098] 6.11 3.63 6.11 3.64 6.12 ...
## $ bipolarPrevalence(%) : num [1:1098] 0.724 0.689 0.725 0.69 0.725 ...
## $ SchizophreniaPrevalence(%): num [1:1098] 0.161 0.17 0.162 0.17 0.162 ...
# This is the R chunk for the Scan I
# Count the missing values in every column of the merged data
# (132 in each disorder column, per the output below).
sapply(MentalHealth, function(col) sum(is.na(col)))
## Country Year
## 0 0
## Gender LifeExpBirth
## 0 4
## LifeExpSixty DepressionPrevalence(%)
## 4 132
## AnxietyPrevalence(%) bipolarPrevalence(%)
## 132 132
## SchizophreniaPrevalence(%)
## 132
# Row positions of the missing values in every column; the four disorder
# columns turn out to be missing on the same rows.
sapply(MentalHealth, function(col) {
  missing <- is.na(col)
  which(missing)
})
## $Country
## integer(0)
##
## $Year
## integer(0)
##
## $Gender
## integer(0)
##
## $LifeExpBirth
## [1] 817 818 819 820
##
## $LifeExpSixty
## [1] 817 818 819 820
##
## $`DepressionPrevalence(%)`
## [1] 115 116 117 118 119 120 139 140 141 142 143 144 163 164 165
## [16] 166 167 168 235 236 237 238 239 240 259 260 261 262 263 264
## [31] 265 266 267 268 269 270 271 272 273 274 275 276 331 332 333
## [46] 334 335 336 463 464 465 466 467 468 541 542 543 544 545 546
## [61] 643 644 645 646 647 648 787 788 789 790 791 792 793 794 795
## [76] 796 797 798 799 800 801 802 803 804 811 812 813 814 815 816
## [91] 955 956 957 958 959 960 973 974 975 976 977 978 1033 1034 1035
## [106] 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
## [121] 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080
##
## $`AnxietyPrevalence(%)`
## [1] 115 116 117 118 119 120 139 140 141 142 143 144 163 164 165
## [16] 166 167 168 235 236 237 238 239 240 259 260 261 262 263 264
## [31] 265 266 267 268 269 270 271 272 273 274 275 276 331 332 333
## [46] 334 335 336 463 464 465 466 467 468 541 542 543 544 545 546
## [61] 643 644 645 646 647 648 787 788 789 790 791 792 793 794 795
## [76] 796 797 798 799 800 801 802 803 804 811 812 813 814 815 816
## [91] 955 956 957 958 959 960 973 974 975 976 977 978 1033 1034 1035
## [106] 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
## [121] 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080
##
## $`bipolarPrevalence(%)`
## [1] 115 116 117 118 119 120 139 140 141 142 143 144 163 164 165
## [16] 166 167 168 235 236 237 238 239 240 259 260 261 262 263 264
## [31] 265 266 267 268 269 270 271 272 273 274 275 276 331 332 333
## [46] 334 335 336 463 464 465 466 467 468 541 542 543 544 545 546
## [61] 643 644 645 646 647 648 787 788 789 790 791 792 793 794 795
## [76] 796 797 798 799 800 801 802 803 804 811 812 813 814 815 816
## [91] 955 956 957 958 959 960 973 974 975 976 977 978 1033 1034 1035
## [106] 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
## [121] 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080
##
## $`SchizophreniaPrevalence(%)`
## [1] 115 116 117 118 119 120 139 140 141 142 143 144 163 164 165
## [16] 166 167 168 235 236 237 238 239 240 259 260 261 262 263 264
## [31] 265 266 267 268 269 270 271 272 273 274 275 276 331 332 333
## [46] 334 335 336 463 464 465 466 467 468 541 542 543 544 545 546
## [61] 643 644 645 646 647 648 787 788 789 790 791 792 793 794 795
## [76] 796 797 798 799 800 801 802 803 804 811 812 813 814 815 816
## [91] 955 956 957 958 959 960 973 974 975 976 977 978 1033 1034 1035
## [106] 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
## [121] 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080
MentalHealth[468, ] # check why a value is missing here; Iran exists in the other datasets as well
filter(dep, grepl("Iran", Country)) # dep does contain data for Iran
# Rename the countries whose names in life do not match the disorder tables.
# car::recode is called explicitly: both car and dplyr export recode(), and
# the "'old'='new'; ..." string syntax only works with car's version, so the
# call must not depend on which package was attached last.
# NOTE(review): 'Democratic People s Republic of Korea' is written without an
# apostrophe; confirm it matches the spelling in life$Country, otherwise that
# country stays unmatched.
# NOTE(review): 'Democratic Republic of the Congo' is mapped to 'Congo';
# confirm life has no separate 'Congo' entry, or two countries end up sharing
# one name.
life$Country <- car::recode(life$Country, "'Bolivia (Plurinational State of)'='Bolivia'; 'Brunei Darussalam'='Brunei' ;'Cabo Verde'='Cape Verde' ; 'Czechia'='Czech Republic' ; 'Democratic People s Republic of Korea'= 'North Korea' ; 'Democratic Republic of the Congo' = 'Congo' ; 'Iran (Islamic Republic of)'='Iran' ; 'Republic of Korea'='South Korea'; 'Republic of Moldova'='Moldova'; 'Republic of North Macedonia'='Macedonia' ; 'Russian Federation'='Russia' ; 'Syrian Arab Republic'='Syria' ; 'Timor-Leste' = 'Timor'; 'United Kingdom of Great Britain and Northern Ireland'= 'United Kingdom'; 'United Republic of Tanzania'='Tanzania';'United States of America'='United States'; 'Venezuela (Bolivarian Republic of)'='Venezuela'; 'Viet Nam'='Vietnam'")
# Re-merge the data after changing the names.
MentalHealth <- life %>%
  left_join(dep, by = c("Country", "Year", "Gender")) %>%
  left_join(anxiety, by = c("Country", "Year", "Gender")) %>%
  left_join(bipolar, by = c("Country", "Year", "Gender")) %>%
  left_join(schizo, by = c("Country", "Year", "Gender"))
# Count the NaN entries of a numeric vector.
# x: any vector. Returns the number of NaN values when x is numeric, and
# NULL (invisibly) for every other type.
is.notanumber <- function(x) {
  if (!is.numeric(x)) {
    return(invisible(NULL))
  }
  sum(is.nan(x))
}
# Check every column for NaN; non-numeric columns yield NULL.
sapply(X = MentalHealth, FUN = is.notanumber)
## $Country
## NULL
##
## $Year
## [1] 0
##
## $Gender
## NULL
##
## $LifeExpBirth
## [1] 0
##
## $LifeExpSixty
## [1] 0
##
## $`DepressionPrevalence(%)`
## [1] 0
##
## $`AnxietyPrevalence(%)`
## [1] 0
##
## $`bipolarPrevalence(%)`
## [1] 0
##
## $`SchizophreniaPrevalence(%)`
## [1] 0
# Share of missing values in every column. The columns are visited one at a
# time instead of with apply(), which would first coerce the whole data frame
# to a character matrix.
vapply(MentalHealth, function(col) mean(is.na(col)), numeric(1))
## Country Year
## 0.000000000 0.000000000
## Gender LifeExpBirth
## 0.000000000 0.003642987
## LifeExpSixty DepressionPrevalence(%)
## 0.003642987 0.027322404
## AnxietyPrevalence(%) bipolarPrevalence(%)
## 0.027322404 0.027322404
## SchizophreniaPrevalence(%)
## 0.027322404
# Missing values per column after the country names were recoded.
sapply(MentalHealth, function(col) sum(is.na(col)))
## Country Year
## 0 0
## Gender LifeExpBirth
## 0 4
## LifeExpSixty DepressionPrevalence(%)
## 4 30
## AnxietyPrevalence(%) bipolarPrevalence(%)
## 30 30
## SchizophreniaPrevalence(%)
## 30
# Drop the rows that still contain a missing value; they make up less than 5%
# of the data.
MentalHealth <- MentalHealth %>% na.omit()
MentalHealth %>% dim()
## [1] 1064 9
# DepressionPrevalence(%), AnxietyPrevalence(%), bipolarPrevalence(%),
# SchizophreniaPrevalence(%) using the rowSums() function, and mutated the
# column Total Mental Health(%) to the Mental health data.
# The four disorder columns are picked by name rather than by position 6:9,
# so a change in column order cannot silently sum the wrong columns.
MentalHealth <- MentalHealth %>%
  mutate(`Total Mental Health(%)` = rowSums(.[c(
    "DepressionPrevalence(%)",
    "AnxietyPrevalence(%)",
    "bipolarPrevalence(%)",
    "SchizophreniaPrevalence(%)"
  )])) # mutating the total mental health(%)
head(MentalHealth) #REQUIREMENT 7
AnxietyPrevalence(%), DepressionPrevalence(%), bipolarPrevalence(%) and SchizophreniaPrevalence(%). We found the number of outliers in each variable using length().
# This is the R chunk for the Scan II
# Draw the six boxplots on a 2 x 3 grid and keep the points each boxplot
# flags as outliers (its $out component).
par(mfrow = c(2, 3))
birth_out <- boxplot(
  MentalHealth[["LifeExpBirth"]],
  main = "Life Expectancy at Birth",
  ylab = "LifeExpBirth (year)"
)$out
sixty_out <- boxplot(
  MentalHealth[["LifeExpSixty"]],
  main = "Life Expectancy at age Sixty",
  ylab = "LifeExpSixty (year)"
)$out
anxiety_out <- boxplot(
  MentalHealth[["AnxietyPrevalence(%)"]],
  main = "Anxiety",
  ylab = "Anxiety Prevelance(%)"
)$out
depression_out <- boxplot(
  MentalHealth[["DepressionPrevalence(%)"]],
  main = "Depression",
  ylab = "Depression Prevelance(%)"
)$out
bipolar_out <- boxplot(
  MentalHealth[["bipolarPrevalence(%)"]],
  main = "bipolar Disorder",
  ylab = "bipolar Prevelance(%)"
)$out
schizophrenia_out <- boxplot(
  MentalHealth[["SchizophreniaPrevalence(%)"]],
  main = "Schizophrenia",
  ylab = "Schizophrenia Prevelance(%)"
)$out
# Number of boxplot outliers for life expectancy at birth.
length(birth_out)
## [1] 0
# Number of boxplot outliers for life expectancy at age sixty.
length(sixty_out)
## [1] 0
# Number of boxplot outliers for anxiety prevalence.
length(anxiety_out)
## [1] 39
anxiety_out # Some countries may have extreme levels of anxiety; these are not bad outliers.
## [1] 8.853074 8.856810 8.860579 8.205766 8.199408 8.194722 8.137634
## [8] 8.046425 7.944376 8.866324 8.868778 8.871199 9.010868 9.004658
## [15] 8.999129 8.717204 8.713704 8.708997 8.609892 8.556644 8.484444
## [22] 8.241013 8.276979 8.318617 10.827724 10.825345 10.822762 9.365674
## [29] 9.363173 9.360701 7.856915 7.859839 7.862461 8.451632 8.440322
## [36] 8.430969 8.869611 8.874364 8.880176
# Number and values of the boxplot outliers for depression prevalence.
length(depression_out)
## [1] 3
depression_out
## [1] 6.550159 6.499087 6.437784
# Number and values of the boxplot outliers for bipolar-disorder prevalence.
length(bipolar_out)
## [1] 15
bipolar_out
## [1] 1.208779 1.208210 1.207600 1.158861 1.158718 1.158512 1.295179 1.294680
## [9] 1.294133 1.204748 1.204971 1.205284 1.225202 1.225874 1.226644
# Number and values of the boxplot outliers for schizophrenia prevalence.
length(schizophrenia_out)
## [1] 51
schizophrenia_out
## [1] 0.3450438 0.3846154 0.3449354 0.3838778 0.3448643 0.3830882 0.3065890
## [8] 0.3228597 0.3068700 0.3231400 0.3071955 0.3234707 0.3379108 0.3314448
## [15] 0.3389727 0.3324961 0.3401197 0.3336084 0.3058479 0.3494099 0.3062084
## [22] 0.3498227 0.3065130 0.3501777 0.3056385 0.3047483 0.3037160 0.3069527
## [29] 0.3039090 0.3001915 0.3610115 0.3747323 0.3588239 0.3711940 0.3563719
## [36] 0.3672442 0.3214907 0.3622047 0.3216563 0.3620049 0.3218903 0.3617764
## [43] 0.3063685 0.3068138 0.3073167 0.3211138 0.3511215 0.3219071 0.3490141
## [50] 0.3231257 0.3463629
# z-scores of the anxiety prevalence.
z1 <- scores(MentalHealth[["AnxietyPrevalence(%)"]], type = "z")
summary(z1) # ranges from about -1.55 to 4.24
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.5520 -0.8601 -0.2169 0.0000 0.4361 4.2441
length(which(abs(z1) > 3)) # 18 rows with |z| > 3
## [1] 18
which(abs(z1) > 3)
## [1] 31 33 35 199 201 203 337 339 341 667 669 671 691 693 695
## [16] 1017 1019 1021
# z-scores of the depression prevalence.
z2 <- scores(MentalHealth[["DepressionPrevalence(%)"]], type = "z")
summary(z2) # ranges from about -1.90 to 3.33
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.8967 -0.7546 -0.2107 0.0000 0.8179 3.3328
length(which(abs(z2) > 3)) # 6 rows with |z| > 3
## [1] 6
which(abs(z2) > 3)
## [1] 535 537 539 631 633 635
# Look at the first flagged row.
MentalHealth[535, ]
# z-scores of the bipolar-disorder prevalence.
z3 <- scores(MentalHealth[["bipolarPrevalence(%)"]], type = "z")
summary(z3) # ranges from about -2.56 to 3.48
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.5568 -0.6725 -0.2364 0.0000 0.6495 3.4837
length(which(abs(z3) > 3)) # 6 rows with |z| > 3
## [1] 6
# z-scores of the schizophrenia prevalence.
z4 <- scores(MentalHealth[["SchizophreniaPrevalence(%)"]], type = "z")
summary(z4) # ranges from about -1.42 to 4.11
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.4154 -0.6854 -0.2060 0.0000 0.4008 4.1067
length(which(abs(z4) > 3)) # 24 rows with |z| > 3
## [1] 24
which(abs(z4) > 3)
## [1] 43 44 45 46 47 48 205 207 209 458 460 462 661 662 663
## [16] 664 665 666 668 670 672 1012 1014 1016
# Anxiety and Schizophrenia have the largest numbers of outliers, so let's
# look at their data points.
# The y axis is the row position. It is derived from the data instead of the
# hard-coded 1:1064, which fails as soon as the number of rows changes.
# Visualizing the points of anxiety using ggplot().
ggplot(
  MentalHealth,
  aes(y = seq_len(nrow(MentalHealth)), x = `AnxietyPrevalence(%)`)
) +
  geom_point(aes(size = `AnxietyPrevalence(%)`)) +
  labs(y = "Row index")
# Visualizing the points of Schizophrenia using ggplot().
ggplot(
  MentalHealth,
  aes(y = seq_len(nrow(MentalHealth)), x = `SchizophreniaPrevalence(%)`)
) +
  geom_point(aes(size = `SchizophreniaPrevalence(%)`)) +
  labs(y = "Row index")
## Transform * In future analysis we will need normally distributed data. We plot histograms using hist() to check for a normal distribution, and then apply the appropriate transformation needed to achieve normality. We can see that LifeExpBirth is left-skewed, so we cubed the data to achieve a normal distribution. We can see that Anxiety is right-skewed, so we applied log() to the variable and then took the cube root of the transformed data to get a normal distribution. * The normally distributed data are as follows.
# LifeExpBirth is left-skewed, so it is cubed.
MentalHealth <- MentalHealth %>%
  mutate(`Transformation of LifeExpBirth` = LifeExpBirth^3)
# Anxiety is right-skewed: take the log, then the cube root of the result.
# The exponent must be written as (1 / 3): `^` binds tighter than `/`, so
# `x^1/3` is x divided by three rather than the cube root of x.
# NOTE(review): the cube root is NaN for prevalence values below 1 (negative
# log); the summary above shows a minimum of 1.438 for the years kept here.
MentalHealth <- MentalHealth %>%
  mutate(
    `Transformation of AnxietyPrevalence(%)` =
      log(`AnxietyPrevalence(%)`)^(1 / 3)
  )
# Histograms of each variable before and after its transformation, on a
# 2 x 2 grid.
par(mfrow = c(2, 2))
hist(
  MentalHealth[["LifeExpBirth"]],
  breaks = 20,
  main = "Distribution of LifeExpBirth`",
  xlab = "LifeExpBirth in Years"
)
hist(
  MentalHealth[["Transformation of LifeExpBirth"]],
  breaks = 20,
  main = "Transformed LifeExpBirth`",
  xlab = "LifeExpBirth in Years"
)
hist(
  MentalHealth[["AnxietyPrevalence(%)"]],
  breaks = 20,
  main = "Distribution of AnxietyPrevalence(%)`",
  xlab = "Anxiety in percentage"
)
hist(
  MentalHealth[["Transformation of AnxietyPrevalence(%)"]],
  breaks = 20,
  main = " Transformed AnxietyPrevalence(%)`",
  xlab = "Anxiety in percentage"
)