## load table into R
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Location of the raw Affairs CSV file on GitHub.
url <- "https://raw.githubusercontent.com/folushoa/Data-Science/R-Project/Affairs.csv"

## Read the comma-separated file, taking column names from its first row.
affairs_table <- fread(url, sep = ",", header = TRUE)

## Preview the 20 rows with the highest number of affairs.
head(arrange(affairs_table, desc(affairs)), 20)
## V1 affairs gender age yearsmarried children religiousness education
## 1: 53 12 female 32.0 10.00 yes 3 17
## 2: 122 12 male 37.0 15.00 yes 4 14
## 3: 174 12 female 42.0 15.00 yes 5 9
## 4: 176 12 male 37.0 10.00 yes 2 20
## 5: 181 12 female 32.0 15.00 yes 3 14
## 6: 252 12 male 27.0 1.50 yes 3 17
## 7: 253 12 female 27.0 7.00 yes 4 14
## 8: 392 12 female 32.0 10.00 yes 2 16
## 9: 513 12 female 22.0 4.00 no 3 12
## 10: 516 12 male 27.0 7.00 yes 1 18
## 11: 520 12 male 47.0 15.00 yes 4 17
## 12: 526 12 female 42.0 15.00 yes 4 12
## 13: 625 12 male 37.0 15.00 yes 2 20
## 14: 951 12 male 32.0 7.00 yes 2 12
## 15: 975 12 female 37.0 15.00 yes 1 18
## 16: 1007 12 female 42.0 15.00 yes 4 14
## 17: 1056 12 female 42.0 15.00 yes 3 14
## 18: 1075 12 male 37.0 10.00 yes 2 20
## 19: 1080 12 female 27.0 7.00 yes 1 14
## 20: 1138 12 female 17.5 0.75 yes 2 12
## occupation rating
## 1: 5 2
## 2: 5 2
## 3: 4 1
## 4: 6 2
## 5: 1 2
## 6: 5 4
## 7: 6 2
## 8: 5 5
## 9: 3 4
## 10: 6 2
## 11: 6 5
## 12: 1 1
## 13: 5 4
## 14: 4 2
## 15: 5 5
## 16: 1 2
## 17: 4 3
## 18: 6 2
## 19: 3 3
## 20: 1 3
## Summary table: for every combination of number of affairs and gender,
## count how many respondents fall into it.
affairs_by_gender <- affairs_table %>%
  group_by(Num_of_Affairs = affairs, Gender = gender) %>%
  summarise(Num_of_M_F = n())
## `summarise()` has grouped output by 'Num_of_Affairs'. You can override using
## the `.groups` argument.
## Print up to the first 15 rows of the gender summary.
affairs_by_gender %>% head(n = 15)
## # A tibble: 12 × 3
## # Groups: Num_of_Affairs [6]
## Num_of_Affairs Gender Num_of_M_F
## <int> <chr> <int>
## 1 0 female 243
## 2 0 male 208
## 3 1 female 15
## 4 1 male 19
## 5 2 female 7
## 6 2 male 10
## 7 3 female 8
## 8 3 male 11
## 9 7 female 22
## 10 7 male 20
## 11 12 female 20
## 12 12 male 18
## Histogram of the number of affairs across all respondents.
ggplot(affairs_table, aes(x = affairs)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## From the table and graph above we see that most people do not have
## affairs. However, the distribution of males and females that do have
## affairs is even, i.e. the number of males and females that had
## ‘1’ affair is about the same as the number of males and females that had
## ‘12’ affairs. From the table, summing over the number of affairs, more
## males (78) had affairs than females (72), although not by much.
## Keep only the affairs and age columns, group by number of affairs, and
## order the rows from the oldest to the youngest respondent.
affairs_by_age <- affairs_table %>%
  select(affairs, age) %>%
  group_by(affairs) %>%
  arrange(desc(age))

## Print the first 10 rows of the age table.
affairs_by_age %>% head(n = 10)
## # A tibble: 10 × 2
## # Groups: affairs [1]
## affairs age
## <int> <dbl>
## 1 0 57
## 2 0 57
## 3 0 57
## 4 0 57
## 5 0 57
## 6 0 57
## 7 0 57
## 8 0 57
## 9 0 57
## 10 0 57
## Age histograms, one panel per number of affairs.
ggplot(affairs_by_age) +
  geom_histogram(aes(x = age)) +
  facet_wrap(vars(affairs))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## From the graphs above, it can be concluded that the people who have
## affairs are clustered between the ages of 20 and 50, with most being
## between 25 and 40.
## Count respondents for every combination of having children (yes/no)
## and number of affairs.
affairs_by_num_of_children <- affairs_table %>%
  group_by(children, affairs) %>%
  summarise(Num_of_A = n())
## `summarise()` has grouped output by 'children'. You can override using the
## `.groups` argument.
## Print up to the first 15 rows of the children summary.
affairs_by_num_of_children %>% head(n = 15)
## # A tibble: 12 × 3
## # Groups: children [2]
## children affairs Num_of_A
## <chr> <int> <int>
## 1 no 0 144
## 2 no 1 7
## 3 no 2 2
## 4 no 3 4
## 5 no 7 7
## 6 no 12 7
## 7 yes 0 307
## 8 yes 1 27
## 9 yes 2 15
## 10 yes 3 15
## 11 yes 7 35
## 12 yes 12 31
## Column chart of respondent counts per number of affairs, with one panel
## for respondents without children and one for respondents with children.
ggplot(affairs_by_num_of_children) +
  geom_col(aes(x = affairs, y = Num_of_A), fill = "green") +
  facet_wrap(vars(children))
## The plots above show that it was mostly people with children who had
## affairs. However, we also see that most people who had no affairs also
## had children. Although one could conclude from this that having children
## and having affairs are not correlated, I would argue that there is a
## correlation, since more of the people who had affairs also had children:
## about 29% of people with children (123 of 430) had at least one affair,
## compared with about 16% of people without children (27 of 171).
## Count respondents for every combination of number of affairs and years
## married, listing the longest marriages first.
affairs_by_years_married <- affairs_table %>%
  group_by(Num_of_Affairs = affairs, Years_Married = yearsmarried) %>%
  summarise(Num_of_Instance = n()) %>%
  arrange(desc(Years_Married))
## `summarise()` has grouped output by 'Num_of_Affairs'. You can override using
## the `.groups` argument.
## Per-column summary statistics of the years-married summary table.
summary(affairs_by_years_married)
## Num_of_Affairs Years_Married Num_of_Instance
## Min. : 0.000 Min. : 0.125 Min. : 1.00
## 1st Qu.: 1.000 1st Qu.: 1.500 1st Qu.: 2.75
## Median : 2.000 Median : 5.500 Median : 6.00
## Mean : 3.833 Mean : 6.301 Mean : 16.69
## 3rd Qu.: 7.000 3rd Qu.:10.000 3rd Qu.: 9.25
## Max. :12.000 Max. :15.000 Max. :142.00
## Scatter plot of respondent counts against number of affairs, with one
## panel per value of years married.
ggplot(affairs_by_years_married) +
  geom_point(aes(x = Num_of_Affairs, y = Num_of_Instance), color = "blue") +
  facet_wrap(vars(Years_Married))
## From the point plot, as the number of years married increases so does
## the number of affairs. After 4 years it seems that the number of
## affairs stays about the same. This is confirmed by the mean of
## Num_of_Affairs in the summary being 3.833 (note that this mean is taken
## over the rows of the grouped summary table, not over individual people).