library(RCurl)
## Loading required package: bitops
# load the required package
library(ggplot2)
df <- read.csv(url("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/Stat2Data/Fertility.csv") , header = TRUE)
summary(df)
## X Age LowAFC MeanAFC
## Min. : 1 Min. :21.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 84 1st Qu.:32.00 1st Qu.: 7.00 1st Qu.: 8.00
## Median :167 Median :35.00 Median :11.00 Median :12.00
## Mean :167 Mean :35.33 Mean :12.29 Mean :13.53
## 3rd Qu.:250 3rd Qu.:39.00 3rd Qu.:15.00 3rd Qu.:17.00
## Max. :333 Max. :46.00 Max. :41.00 Max. :51.50
## FSH E2 MaxE2 MaxDailyGn
## Min. : 0.500 Min. :13.00 Min. : 290 Min. :100.0
## 1st Qu.: 4.600 1st Qu.:30.00 1st Qu.: 994 1st Qu.:225.0
## Median : 5.700 Median :39.00 Median :1443 Median :300.0
## Mean : 5.935 Mean :41.25 Mean :1546 Mean :310.8
## 3rd Qu.: 6.900 3rd Qu.:52.00 3rd Qu.:1856 3rd Qu.:450.0
## Max. :16.000 Max. :90.00 Max. :6242 Max. :525.0
## TotalGn Oocytes Embryos
## Min. : 825 Min. : 1.00 Min. : 0.000
## 1st Qu.:1675 1st Qu.: 7.00 1st Qu.: 4.000
## Median :2550 Median :11.00 Median : 6.000
## Mean :2831 Mean :11.84 Mean : 6.727
## 3rd Qu.:3962 3rd Qu.:15.00 3rd Qu.: 9.000
## Max. :7275 Max. :35.00 Max. :23.000
str(df)
## 'data.frame': 333 obs. of 11 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 40 37 40 40 30 29 31 33 36 35 ...
## $ LowAFC : int 40 41 38 36 36 35 24 28 30 32 ...
## $ MeanAFC : num 51.5 41 41 37.5 36 35 35 34 33 32 ...
## $ FSH : num 5.3 7.1 4.9 3.9 4 3.9 3.8 4.3 4.9 3.7 ...
## $ E2 : int 45 53 40 26 49 67 49 20 60 36 ...
## $ MaxE2 : int 1427 802 4533 1804 2526 3812 1087 1615 1879 2009 ...
## $ MaxDailyGn: num 300 225 450 300 150 ...
## $ TotalGn : num 2700 1800 4850 2700 1500 ...
## $ Oocytes : int 25 7 27 9 19 19 13 15 23 26 ...
## $ Embryos : int 13 6 15 4 12 16 9 9 10 8 ...
head(df)
## X Age LowAFC MeanAFC FSH E2 MaxE2 MaxDailyGn TotalGn Oocytes Embryos
## 1 1 40 40 51.5 5.3 45 1427 300 2700 25 13
## 2 2 37 41 41.0 7.1 53 802 225 1800 7 6
## 3 3 40 38 41.0 4.9 40 4533 450 4850 27 15
## 4 4 40 36 37.5 3.9 26 1804 300 2700 9 4
## 5 5 30 36 36.0 4.0 49 2526 150 1500 19 12
## 6 6 29 35 35.0 3.9 67 3812 150 975 19 16
sub_df <- subset(df, Age >= 32 & Oocytes > 20 & Embryos > 10)
head(sub_df, 30)
## X Age LowAFC MeanAFC FSH E2 MaxE2 MaxDailyGn TotalGn Oocytes Embryos
## 1 1 40 40 51.5 5.3 45 1427 300 2700.0 25 13
## 3 3 40 38 41.0 4.9 40 4533 450 4850.0 27 15
## 12 12 39 32 32.0 5.3 37 1291 300 2700.0 22 18
## 19 19 37 18 27.0 5.7 46 2427 225 2300.0 25 15
## 22 22 36 24 27.0 4.8 22 2465 150 1212.5 21 11
## 24 24 34 26 26.0 5.8 20 2980 225 1387.5 26 14
## 42 42 39 21 21.0 2.4 20 1966 300 2100.0 30 21
## 46 46 34 20 20.0 4.7 61 1726 225 1950.0 29 16
## 61 61 33 19 19.0 4.6 55 1668 150 1087.5 23 20
## 66 66 41 17 18.0 6.3 34 1856 300 2175.0 23 12
## 87 87 40 8 16.5 6.9 81 1634 375 3525.0 23 11
## 97 97 33 14 16.0 4.6 35 1985 225 1688.0 28 17
## 137 137 32 13 13.0 7.8 29 1412 450 4050.0 26 20
## 142 142 35 13 13.0 6.4 45 2044 375 3675.0 35 23
## 241 241 34 9 9.0 5.0 44 3349 225 1575.0 21 22
## 308 308 43 6 6.0 4.9 33 1838 300 3263.0 26 13
# rename columns
names(sub_df) <- c("Record", "age", "low_AFC", "mean_AFC", "FSH", "E2", "max_E2", "max_D_gn", "total_gn", "oocytes", "embryos")
head(sub_df)
## Record age low_AFC mean_AFC FSH E2 max_E2 max_D_gn total_gn oocytes
## 1 1 40 40 51.5 5.3 45 1427 300 2700.0 25
## 3 3 40 38 41.0 4.9 40 4533 450 4850.0 27
## 12 12 39 32 32.0 5.3 37 1291 300 2700.0 22
## 19 19 37 18 27.0 5.7 46 2427 225 2300.0 25
## 22 22 36 24 27.0 4.8 22 2465 150 1212.5 21
## 24 24 34 26 26.0 5.8 20 2980 225 1387.5 26
## embryos
## 1 13
## 3 15
## 12 18
## 19 15
## 22 11
## 24 14
# replace values of specific age into reference of the E2 value
sub_df$age[sub_df$age == 34] <- "Lw_E2"
sub_df$age[sub_df$age == 40] <- "H_E2"
head(sub_df)
## Record age low_AFC mean_AFC FSH E2 max_E2 max_D_gn total_gn oocytes
## 1 1 H_E2 40 51.5 5.3 45 1427 300 2700.0 25
## 3 3 H_E2 38 41.0 4.9 40 4533 450 4850.0 27
## 12 12 39 32 32.0 5.3 37 1291 300 2700.0 22
## 19 19 37 18 27.0 5.7 46 2427 225 2300.0 25
## 22 22 36 24 27.0 4.8 22 2465 150 1212.5 21
## 24 24 Lw_E2 26 26.0 5.8 20 2980 225 1387.5 26
## embryos
## 1 13
## 3 15
## 12 18
## 19 15
## 22 11
## 24 14
# delete unused data from the table
sub_df <- sub_df[, -(4:5), drop = FALSE]
sub_df <- sub_df[, -(5:7), drop = FALSE]
head(sub_df)
## Record age low_AFC E2 oocytes embryos
## 1 1 H_E2 40 45 25 13
## 3 3 H_E2 38 40 27 15
## 12 12 39 32 37 22 18
## 19 19 37 18 46 25 15
## 22 22 36 24 22 21 11
## 24 24 Lw_E2 26 20 26 14
summary(sub_df)
## Record age low_AFC E2
## Min. : 1.00 Length:16 Min. : 6.00 Min. :20.00
## 1st Qu.: 21.25 Class :character 1st Qu.:13.00 1st Qu.:32.00
## Median : 53.50 Mode :character Median :18.50 Median :38.50
## Mean : 81.75 Mean :19.88 Mean :40.44
## 3rd Qu.:107.00 3rd Qu.:24.50 3rd Qu.:45.25
## Max. :308.00 Max. :40.00 Max. :81.00
## oocytes embryos
## Min. :21.00 Min. :11.00
## 1st Qu.:23.00 1st Qu.:13.00
## Median :25.50 Median :15.50
## Mean :25.62 Mean :16.31
## 3rd Qu.:27.25 3rd Qu.:20.00
## Max. :35.00 Max. :23.00
# show the frequency of age samples that repeated in the subset data
hist(as.numeric(as.character(sub_df[['age']])))
## Warning in hist(as.numeric(as.character(sub_df[["age"]]))): NAs introduced
## by coercion
hist(as.numeric(as.character(sub_df[['E2']])))
hist(as.numeric(as.character(sub_df[['oocytes']])))
hist(as.numeric(as.character(sub_df[['embryos']])))
table(sub_df$age)
##
## 32 33 35 36 37 39 41 43 H_E2 Lw_E2
## 1 2 1 1 1 2 1 1 3 3
table(sub_df$age, sub_df$embryos)
##
## 11 12 13 14 15 16 17 18 20 21 22 23
## 32 0 0 0 0 0 0 0 0 1 0 0 0
## 33 0 0 0 0 0 0 1 0 1 0 0 0
## 35 0 0 0 0 0 0 0 0 0 0 0 1
## 36 1 0 0 0 0 0 0 0 0 0 0 0
## 37 0 0 0 0 1 0 0 0 0 0 0 0
## 39 0 0 0 0 0 0 0 1 0 1 0 0
## 41 0 1 0 0 0 0 0 0 0 0 0 0
## 43 0 0 1 0 0 0 0 0 0 0 0 0
## H_E2 1 0 1 0 1 0 0 0 0 0 0 0
## Lw_E2 0 0 0 1 0 1 0 0 0 0 1 0
ggplot(sub_df, aes(age, oocytes, color = embryos)) + geom_point() + stat_smooth(method=lm, se=FALSE, fullrange=TRUE)
ggplot(sub_df, aes(age, E2, color = embryos)) + geom_point() + stat_smooth(method=lm, se=FALSE, fullrange=TRUE)
ggplot(sub_df, aes(age, embryos)) + geom_boxplot()
ggplot(sub_df, aes(age, low_AFC)) + geom_boxplot()
Study objective: The main reason I picked this dataset was to know the factors that affect women fertility. Also see weather the women that have lower antral follicle counts (AFC) affects women fertility. Some opinions claim that when women reached a certain age their fertility decreases and hence their chances in pregnancy decline.
Main outcome: The results based on taking a sample dataset from the whole dataset. The age range was 32 and older, oocytes more than 20 and embryos more than 10.
Results: Based on the provided dataset, the results show that age 36 has lower oocytes and embryos. However, as women gets older the number of oocytes increased. There was one outlier record for age 35.
Age 33 years old has the maximum Estradiol hermone amount E2 - around 50, which is responsible for the growth and development of female sex organs. Also, women older than 43 years old has a low AFC and low number of embryos.
comments:
Note that the data frame includes information about age, number of Embryos and Oocytes that women have according to their age. This analysis will give an answer if the women age really effect of her fertility.