This data set contains health and income outcomes for 184 countries from 1960 to 2016.
# Read gapminder.csv from the working directory into `mydata`.
mydata <- read.csv("gapminder.csv")
# head() shows 6 rows by default; the `n` argument selects how many to show.
head(mydata, n = 10)
# Build the per-column summary, keep it in `mydata_summary`, and print it.
mydata_summary <- summary(mydata)
print(mydata_summary)
## X country year infant_mortality
## Min. : 1 Length:10545 Min. :1960 Min. : 1.50
## 1st Qu.: 2637 Class :character 1st Qu.:1974 1st Qu.: 16.00
## Median : 5273 Mode :character Median :1988 Median : 41.50
## Mean : 5273 Mean :1988 Mean : 55.31
## 3rd Qu.: 7909 3rd Qu.:2002 3rd Qu.: 85.10
## Max. :10545 Max. :2016 Max. :276.90
## NA's :1453
## life_expectancy fertility population gdp
## Min. :13.20 Min. :0.840 Min. :3.124e+04 Min. :4.040e+07
## 1st Qu.:57.50 1st Qu.:2.200 1st Qu.:1.333e+06 1st Qu.:1.846e+09
## Median :67.54 Median :3.750 Median :5.009e+06 Median :7.794e+09
## Mean :64.81 Mean :4.084 Mean :2.701e+07 Mean :1.480e+11
## 3rd Qu.:73.00 3rd Qu.:6.000 3rd Qu.:1.523e+07 3rd Qu.:5.540e+10
## Max. :83.90 Max. :9.220 Max. :1.376e+09 Max. :1.174e+13
## NA's :187 NA's :185 NA's :2972
## continent region
## Length:10545 Length:10545
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
# Central tendency of the life_expectancy column: mean first, then median.
mean_life_expect <- mean(mydata$life_expectancy)
mean_life_expect
## [1] 64.81162
median_life_expect <- median(mydata$life_expectancy)
median_life_expect
## [1] 67.54
# Keep only the rows that have an infant_mortality value, restricted to the
# four columns of interest. Missing values are tested with is.na(): a
# comparison such as `x != "NA"` evaluates to NA (not TRUE/FALSE) for missing
# entries, so it does not express the filter reliably.
new_data <- subset(
  mydata,
  !is.na(infant_mortality),
  select = c("year", "infant_mortality", "gdp", "life_expectancy")
)
new_data
# Copy the filtered values into cleaned_infant_mortality, then drop the
# original column; the new column therefore ends up as the last one.
new_data$cleaned_infant_mortality <- new_data$infant_mortality
new_data <- subset(new_data, select = -infant_mortality)
new_data
# Summarise the cleaned subset and show the result.
new_summary <- summary(new_data)
new_summary
## year gdp life_expectancy cleaned_infant_mortality
## Min. :1960 Min. :4.040e+07 Min. :13.20 Min. : 1.50
## 1st Qu.:1977 1st Qu.:1.974e+09 1st Qu.:57.24 1st Qu.: 16.00
## Median :1990 Median :8.249e+09 Median :67.58 Median : 41.50
## Mean :1989 Mean :1.556e+11 Mean :64.86 Mean : 55.31
## 3rd Qu.:2003 3rd Qu.:5.843e+10 3rd Qu.:73.18 3rd Qu.: 85.10
## Max. :2015 Max. :1.174e+13 Max. :83.30 Max. :276.90
## NA's :1953
# Count the missing fertility values before imputing them.
naInFertility <- sum(is.na(mydata[["fertility"]]))
# Mean fertility over the observed (non-NA) values only.
mean_fertility <- mean(mydata[["fertility"]], na.rm = TRUE)
# Mean imputation: overwrite every missing fertility value with that mean.
mydata$fertility[is.na(mydata$fertility)] <- mean_fertility
mydata
# Relabel the country "Australia" as "Country_kangaroo".
mydata$country <- replace(
  mydata$country,
  mydata$country == "Australia",
  "Country_kangaroo"
)
# Flag fertility values above 7 with the label "healthAtRisk".
# NOTE: writing a string into the numeric fertility column converts the whole
# column to character, so later numeric operations on it will not work.
mydata$fertility <- replace(
  mydata$fertility,
  mydata$fertility > 7,
  "healthAtRisk"
)
mydata
# Show the first 25 rows of the modified data.
head(mydata, n = 25)