#task 1
importing the data
# Work on a local copy of the built-in `swiss` data set that ships with R.
mydata <- swiss
# Display the complete table of data.
print(mydata)
## Fertility Agriculture Examination Education Catholic
## Courtelary 80.2 17.0 15 12 9.96
## Delemont 83.1 45.1 6 9 84.84
## Franches-Mnt 92.5 39.7 5 5 93.40
## Moutier 85.8 36.5 12 7 33.77
## Neuveville 76.9 43.5 17 15 5.16
## Porrentruy 76.1 35.3 9 7 90.57
## Broye 83.8 70.2 16 7 92.85
## Glane 92.4 67.8 14 8 97.16
## Gruyere 82.4 53.3 12 7 97.67
## Sarine 82.9 45.2 16 13 91.38
## Veveyse 87.1 64.5 14 6 98.61
## Aigle 64.1 62.0 21 12 8.52
## Aubonne 66.9 67.5 14 7 2.27
## Avenches 68.9 60.7 19 12 4.43
## Cossonay 61.7 69.3 22 5 2.82
## Echallens 68.3 72.6 18 2 24.20
## Grandson 71.7 34.0 17 8 3.30
## Lausanne 55.7 19.4 26 28 12.11
## La Vallee 54.3 15.2 31 20 2.15
## Lavaux 65.1 73.0 19 9 2.84
## Morges 65.5 59.8 22 10 5.23
## Moudon 65.0 55.1 14 3 4.52
## Nyone 56.6 50.9 22 12 15.14
## Orbe 57.4 54.1 20 6 4.20
## Oron 72.5 71.2 12 1 2.40
## Payerne 74.2 58.1 14 8 5.23
## Paysd'enhaut 72.0 63.5 6 3 2.56
## Rolle 60.5 60.8 16 10 7.72
## Vevey 58.3 26.8 25 19 18.46
## Yverdon 65.4 49.5 15 8 6.10
## Conthey 75.5 85.9 3 2 99.71
## Entremont 69.3 84.9 7 6 99.68
## Herens 77.3 89.7 5 2 100.00
## Martigwy 70.5 78.2 12 6 98.96
## Monthey 79.4 64.9 7 3 98.22
## St Maurice 65.0 75.9 9 9 99.06
## Sierre 92.2 84.6 3 3 99.46
## Sion 79.3 63.1 13 13 96.83
## Boudry 70.4 38.4 26 12 5.62
## La Chauxdfnd 65.7 7.7 29 11 13.79
## Le Locle 72.7 16.7 22 13 11.22
## Neuchatel 64.4 17.6 35 32 16.92
## Val de Ruz 77.6 37.6 15 7 4.97
## ValdeTravers 67.6 18.7 25 7 8.65
## V. De Geneve 35.0 1.2 37 53 42.34
## Rive Droite 44.7 46.6 16 29 50.43
## Rive Gauche 42.8 27.7 22 29 58.33
## Infant.Mortality
## Courtelary 22.2
## Delemont 22.2
## Franches-Mnt 20.2
## Moutier 20.3
## Neuveville 20.6
## Porrentruy 26.6
## Broye 23.6
## Glane 24.9
## Gruyere 21.0
## Sarine 24.4
## Veveyse 24.5
## Aigle 16.5
## Aubonne 19.1
## Avenches 22.7
## Cossonay 18.7
## Echallens 21.2
## Grandson 20.0
## Lausanne 20.2
## La Vallee 10.8
## Lavaux 20.0
## Morges 18.0
## Moudon 22.4
## Nyone 16.7
## Orbe 15.3
## Oron 21.0
## Payerne 23.8
## Paysd'enhaut 18.0
## Rolle 16.3
## Vevey 20.9
## Yverdon 22.5
## Conthey 15.1
## Entremont 19.8
## Herens 18.3
## Martigwy 19.4
## Monthey 20.2
## St Maurice 17.8
## Sierre 16.3
## Sion 18.1
## Boudry 20.3
## La Chauxdfnd 20.5
## Le Locle 18.9
## Neuchatel 23.0
## Val de Ruz 20.0
## ValdeTravers 19.5
## V. De Geneve 18.0
## Rive Droite 18.2
## Rive Gauche 19.3
#install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compact overview of the data frame: dimensions, column types, first values.
dplyr::glimpse(mydata)
## Rows: 47
## Columns: 6
## $ Fertility <dbl> 80.2, 83.1, 92.5, 85.8, 76.9, 76.1, 83.8, 92.4, 82.4,…
## $ Agriculture <dbl> 17.0, 45.1, 39.7, 36.5, 43.5, 35.3, 70.2, 67.8, 53.3,…
## $ Examination <int> 15, 6, 5, 12, 17, 9, 16, 14, 12, 16, 14, 21, 14, 19, …
## $ Education <int> 12, 9, 5, 7, 15, 7, 7, 8, 7, 13, 6, 12, 7, 12, 5, 2, …
## $ Catholic <dbl> 9.96, 84.84, 93.40, 33.77, 5.16, 90.57, 92.85, 97.16,…
## $ Infant.Mortality <dbl> 22.2, 22.2, 20.2, 20.3, 20.6, 26.6, 23.6, 24.9, 21.0,…
Description of the variables: Fertility is a common standardized fertility measure (Ig) for the region, scaled to 0-100. Agriculture is the percentage of males involved in agriculture as an occupation. Examination is the percentage of draftees receiving the highest mark on the army examination. Education is the percentage of draftees with education beyond primary school. Catholic is the percentage of Catholics (as opposed to Protestants). Infant.Mortality is the percentage of live births who live less than 1 year.
# Keep the first 10 rows and two variables: Fertility (column 1) and
# Infant.Mortality (column 6). Selecting the columns by name makes the
# choice explicit and independent of the column order.
mydata2 <- mydata[1:10, c("Fertility", "Infant.Mortality")]
print(mydata2)
## Fertility Infant.Mortality
## Courtelary 80.2 22.2
## Delemont 83.1 22.2
## Franches-Mnt 92.5 20.2
## Moutier 85.8 20.3
## Neuveville 76.9 20.6
## Porrentruy 76.1 26.6
## Broye 83.8 23.6
## Glane 92.4 24.9
## Gruyere 82.4 21.0
## Sarine 82.9 24.4
# Overwrite a single cell of the data frame: the Fertility value of
# Porrentruy (row 6, column 1) is set to 75.
mydata2["Porrentruy", "Fertility"] <- 75
print(mydata2)
## Fertility Infant.Mortality
## Courtelary 80.2 22.2
## Delemont 83.1 22.2
## Franches-Mnt 92.5 20.2
## Moutier 85.8 20.3
## Neuveville 76.9 20.6
## Porrentruy 75.0 26.6
## Broye 83.8 23.6
## Glane 92.4 24.9
## Gruyere 82.4 21.0
## Sarine 82.9 24.4
# Add a categorical (qualitative) variable "Access to healthcare" to the
# existing data frame. It is stored as a factor with the codes "0" and "1".
# Backticks are used because the column name contains spaces.
mydata2$`Access to healthcare` <- factor(
  c("1", "1", "1", "0", "0", "1", "1", "1", "0", "1")
)
print(mydata2)
## Fertility Infant.Mortality Access to healthcare
## Courtelary 80.2 22.2 1
## Delemont 83.1 22.2 1
## Franches-Mnt 92.5 20.2 1
## Moutier 85.8 20.3 0
## Neuveville 76.9 20.6 0
## Porrentruy 75.0 26.6 1
## Broye 83.8 23.6 1
## Glane 92.4 24.9 1
## Gruyere 82.4 21.0 0
## Sarine 82.9 24.4 1
Explanation of the variable: Access to healthcare is a categorical variable coded as 0 = Poor and 1 = Good.
# Convert the 0/1 codes of the factor into labels: 0 becomes "P", 1 becomes "G".
mydata2[["Access to healthcare"]] <- factor(
  mydata2[["Access to healthcare"]],
  levels = c(0, 1),
  labels = c("P", "G")
)
print(mydata2)
## Fertility Infant.Mortality Access to healthcare
## Courtelary 80.2 22.2 G
## Delemont 83.1 22.2 G
## Franches-Mnt 92.5 20.2 G
## Moutier 85.8 20.3 P
## Neuveville 76.9 20.6 P
## Porrentruy 75.0 26.6 G
## Broye 83.8 23.6 G
## Glane 92.4 24.9 G
## Gruyere 82.4 21.0 P
## Sarine 82.9 24.4 G
# Summary statistics of the original variables of the data set.
# The "Access to healthcare" variable was added to `mydata2`, not to `mydata`,
# so there is no seventh column to exclude here. Selecting the original
# columns by name avoids the out-of-range negative index `-7`.
summary(mydata[, colnames(swiss)])
## Fertility Agriculture Examination Education
## Min. :35.00 Min. : 1.20 Min. : 3.00 Min. : 1.00
## 1st Qu.:64.70 1st Qu.:35.90 1st Qu.:12.00 1st Qu.: 6.00
## Median :70.40 Median :54.10 Median :16.00 Median : 8.00
## Mean :70.14 Mean :50.66 Mean :16.49 Mean :10.98
## 3rd Qu.:78.45 3rd Qu.:67.65 3rd Qu.:22.00 3rd Qu.:12.00
## Max. :92.50 Max. :89.70 Max. :37.00 Max. :53.00
## Catholic Infant.Mortality
## Min. : 2.150 Min. :10.80
## 1st Qu.: 5.195 1st Qu.:18.15
## Median : 15.140 Median :20.00
## Mean : 41.144 Mean :19.94
## 3rd Qu.: 93.125 3rd Qu.:21.70
## Max. :100.000 Max. :26.60
Interpretation of the summary statistics: 1. The average percentage of Catholics across the 47 provinces of Switzerland is 41.1%. 2. In half of the Swiss provinces, 8% or fewer of the draftees had education beyond primary school; in the other half, 8% or more did. 3. The minimum percentage of males involved in agriculture as an occupation is 1.2%.
# install.packages("ggplot2")
library(ggplot2)

# Store the province names (the row names) as a regular column for plotting.
mydata$Province <- rownames(mydata)

# Restrict the bar plot to the first 10 rows of the data.
swiss_10 <- mydata[1:10, ]

# Bar plot of Education, with the provinces ordered by increasing Education.
# geom_col() takes the bar heights directly from the y values, i.e. it is the
# direct equivalent of geom_bar(stat = "identity").
ggplot(swiss_10, aes(x = reorder(Province, Education), y = Education)) +
  geom_col(fill = "darkseagreen", color = "darkorchid2") +
  labs(
    title = "Education by Province in Switzerland",
    x = "Province",
    y = "Education (%)"
  ) +
  theme_minimal() +
  # Rotate and align the labels on the x axis.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Description of the bar plot: Among the first ten French-speaking provinces we observed, Neuveville has the highest percentage of draftees with education beyond primary school, at 15%. Meanwhile, Franches-Montagnes has the lowest percentage of draftees with education beyond primary school, at 5%.
# Relationship between Fertility and Infant.Mortality
library(ggplot2)

# Scatter plot with Fertility on the x axis and Infant.Mortality on the y axis.
ggplot(mydata, aes(x = Fertility, y = Infant.Mortality)) +
  geom_point(color = "cornflowerblue", size = 1.5) +
  labs(title = "Scatterplot") +
  theme_gray()

# Pearson correlation, computed on the same data frame that is plotted.
cor(mydata$Fertility, mydata$Infant.Mortality, method = "pearson")
## [1] 0.416556
Description of the scatter plot: The scatter plot shows a positive linear relationship between two variables, Fertility (x) and Infant.Mortality (y). The Pearson correlation is positive (r ≈ 0.42) and moderate in strength, meaning that as one variable increases, the other tends to increase too.