Project

What factors contribute to how frequently a person has an affair? Using the

loaded table we shall see if there is any correlation between the frequency of

affairs and gender, age, years married, and whether they have children or not.

## load table into R
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

## Location of the raw Affairs data set (CSV hosted on GitHub).
url <- "https://raw.githubusercontent.com/folushoa/Data-Science/R-Project/Affairs.csv"
## Read the comma-separated file, taking column names from its first row.
affairs_table <- fread(input = url, sep = ",", header = TRUE)

## Preview the 20 rows with the largest number of affairs.
head(arrange(affairs_table, desc(affairs)), 20)
##       V1 affairs gender  age yearsmarried children religiousness education
##  1:   53      12 female 32.0        10.00      yes             3        17
##  2:  122      12   male 37.0        15.00      yes             4        14
##  3:  174      12 female 42.0        15.00      yes             5         9
##  4:  176      12   male 37.0        10.00      yes             2        20
##  5:  181      12 female 32.0        15.00      yes             3        14
##  6:  252      12   male 27.0         1.50      yes             3        17
##  7:  253      12 female 27.0         7.00      yes             4        14
##  8:  392      12 female 32.0        10.00      yes             2        16
##  9:  513      12 female 22.0         4.00       no             3        12
## 10:  516      12   male 27.0         7.00      yes             1        18
## 11:  520      12   male 47.0        15.00      yes             4        17
## 12:  526      12 female 42.0        15.00      yes             4        12
## 13:  625      12   male 37.0        15.00      yes             2        20
## 14:  951      12   male 32.0         7.00      yes             2        12
## 15:  975      12 female 37.0        15.00      yes             1        18
## 16: 1007      12 female 42.0        15.00      yes             4        14
## 17: 1056      12 female 42.0        15.00      yes             3        14
## 18: 1075      12   male 37.0        10.00      yes             2        20
## 19: 1080      12 female 27.0         7.00      yes             1        14
## 20: 1138      12 female 17.5         0.75      yes             2        12
##     occupation rating
##  1:          5      2
##  2:          5      2
##  3:          4      1
##  4:          6      2
##  5:          1      2
##  6:          5      4
##  7:          6      2
##  8:          5      5
##  9:          3      4
## 10:          6      2
## 11:          6      5
## 12:          1      1
## 13:          5      4
## 14:          4      2
## 15:          5      5
## 16:          1      2
## 17:          4      3
## 18:          6      2
## 19:          3      3
## 20:          1      3

Observation 1: Who has more affairs, males or females?

## Summary table that describes the number of males and females for each
## number of affairs

## Count respondents for every (number of affairs, gender) combination.
## `.groups = "drop_last"` spells out dplyr's default regrouping: the result
## stays grouped by Num_of_Affairs only. Stating it explicitly removes the
## reliance on an implicit default and the `summarise()` message it triggers.
affairs_by_gender <- affairs_table %>%
  select(Num_of_Affairs = affairs, Gender = gender) %>%
  group_by(Num_of_Affairs, Gender) %>%
  summarize(Num_of_M_F = n(), .groups = "drop_last")
## Show up to 15 rows of the summary.
head(affairs_by_gender, 15)
## # A tibble: 12 × 3
## # Groups:   Num_of_Affairs [6]
##    Num_of_Affairs Gender Num_of_M_F
##             <int> <chr>       <int>
##  1              0 female        243
##  2              0 male          208
##  3              1 female         15
##  4              1 male           19
##  5              2 female          7
##  6              2 male           10
##  7              3 female          8
##  8              3 male           11
##  9              7 female         22
## 10              7 male           20
## 11             12 female         20
## 12             12 male           18
## Distribution of the number of affairs across all respondents.
## A bin width of 1 gives every integer count its own bar, replacing the
## arbitrary 30-bin default that ggplot2 asks you to override.
ggplot(data = affairs_table) +
  geom_histogram(aes(x = affairs), binwidth = 1)

## From the table and graph above we see that most people do not have affairs. However, the distribution of the number of males and females that do have affairs is even, i.e. the number of males and females that had ‘1’ affair is about the same as the number of males and females that had ‘12’ affairs. From the table, grouping by number of affairs, more males had affairs than females, although not by much.

Observation 2: Which age groups have affairs?

## Keep only the affairs and age columns, group the rows by number of
## affairs, and order them from the oldest respondents to the youngest.
affairs_by_age <- affairs_table %>%
  select(affairs, age) %>%
  group_by(affairs) %>%
  arrange(desc(age))
## Show the first 10 rows (the oldest respondents).
affairs_by_age %>% head(10)
## # A tibble: 10 × 2
## # Groups:   affairs [1]
##    affairs   age
##      <int> <dbl>
##  1       0    57
##  2       0    57
##  3       0    57
##  4       0    57
##  5       0    57
##  6       0    57
##  7       0    57
##  8       0    57
##  9       0    57
## 10       0    57
## Age histogram, drawn in a separate panel for each number of affairs.
ggplot(affairs_by_age, mapping = aes(x = age)) +
  geom_histogram() +
  facet_wrap(vars(affairs))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## From the graphs above, it can be concluded that the cluster of people who have affairs are between the ages of 20 and 50, with most being between 25 and 40.

Observation 3: Does having children lead to having affairs?

## Count respondents for every (children, number of affairs) combination.
## `.groups = "drop_last"` spells out dplyr's default regrouping: the result
## stays grouped by children only. Stating it explicitly removes the reliance
## on an implicit default and the `summarise()` message it triggers.
affairs_by_num_of_children <- affairs_table %>%
  group_by(children, affairs) %>%
  summarize(Num_of_A = n(), .groups = "drop_last")
## Show up to 15 rows of the summary.
head(affairs_by_num_of_children, 15)
## # A tibble: 12 × 3
## # Groups:   children [2]
##    children affairs Num_of_A
##    <chr>      <int>    <int>
##  1 no             0      144
##  2 no             1        7
##  3 no             2        2
##  4 no             3        4
##  5 no             7        7
##  6 no            12        7
##  7 yes            0      307
##  8 yes            1       27
##  9 yes            2       15
## 10 yes            3       15
## 11 yes            7       35
## 12 yes           12       31
## Column chart of respondent counts per number of affairs, with one panel
## for each value of `children`.
ggplot(affairs_by_num_of_children, mapping = aes(x = affairs, y = Num_of_A)) +
  geom_col(fill = "green") +
  facet_wrap(vars(children))

## The plots above show that it was mostly people with children that had affairs. However, we also see that most of the people who had no affairs also had children. Although from this one could say that having children and having affairs have no correlation, I would argue that, since more of the people who had affairs also had children, there is a correlation between having children and having affairs.

Observation 4: Is the number of years married directly proportional to the number

of affairs?

## Count respondents for every (number of affairs, years married)
## combination, ordered from the longest marriages to the shortest.
## `.groups = "drop_last"` spells out dplyr's default regrouping: the result
## stays grouped by Num_of_Affairs only. Stating it explicitly removes the
## reliance on an implicit default and the `summarise()` message it triggers.
affairs_by_years_married <- affairs_table %>%
  select(Num_of_Affairs = affairs, Years_Married = yearsmarried) %>%
  group_by(Num_of_Affairs, Years_Married) %>%
  summarize(Num_of_Instance = n(), .groups = "drop_last") %>%
  arrange(desc(Years_Married))
## NOTE(review): summary() is applied to the aggregated table, which has one
## row per affairs/years-married combination, so the statistics it reports are
## over those combinations rather than over individual respondents.
affairs_by_years_married %>% summary()
##  Num_of_Affairs   Years_Married    Num_of_Instance 
##  Min.   : 0.000   Min.   : 0.125   Min.   :  1.00  
##  1st Qu.: 1.000   1st Qu.: 1.500   1st Qu.:  2.75  
##  Median : 2.000   Median : 5.500   Median :  6.00  
##  Mean   : 3.833   Mean   : 6.301   Mean   : 16.69  
##  3rd Qu.: 7.000   3rd Qu.:10.000   3rd Qu.:  9.25  
##  Max.   :12.000   Max.   :15.000   Max.   :142.00
## Scatter of respondent counts against number of affairs, with one panel
## for each distinct value of Years_Married.
ggplot(
  data = affairs_by_years_married,
  mapping = aes(x = Num_of_Affairs, y = Num_of_Instance)
) +
  geom_point(color = "blue") +
  facet_wrap(vars(Years_Married))

## From the point plot, as the number of years married increases so does the number of affairs. After 4 years it seems that the number of affairs is the same. This is confirmed by the mean of the number of affairs being 3.833.