library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
library(datasets)
library(tidyr)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Load the raw patient records. Cells holding "기록없음" in the numeric
# height/weight columns cannot be parsed as doubles, so readr reports them as
# parsing failures and stores NA instead.
cancer <- read_csv("cancer.csv")
## Parsed with column specification:
## cols(
## age = col_double(),
## sex = col_character(),
## height = col_double(),
## weight = col_double(),
## dateOfoperation = col_character(),
## cancerStaging = col_character(),
## hospitalization = col_double(),
## diseaseCode = col_character()
## )
## Warning: 278 parsing failures.
## row col expected actual file
## 1294 height a double 기록없음 'cancer.csv'
## 1294 weight a double 기록없음 'cancer.csv'
## 1365 height a double 기록없음 'cancer.csv'
## 1430 height a double 기록없음 'cancer.csv'
## 1459 height a double 기록없음 'cancer.csv'
## .... ...... ........ ........ ............
## See problems(...) for more details.
# Open the spreadsheet-style viewer only when a user is at the console;
# View() needs a GUI and serves no purpose when the script runs in batch mode.
if (interactive()) {
  View(cancer)
}
# Print the first rows of the data as a quick sanity check.
head(cancer)
## # A tibble: 6 x 8
## age sex height weight dateOfoperation cancerStaging hospitalization
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 75 남 161 64 2011.6.22 I 48
## 2 52 여 177. 75.3 2011.5.19 IV 17
## 3 67 여 154 65.6 2011.5.31 III 10
## 4 62 남 162 57 2011.6.21 I 11
## 5 70 남 171 65 2011.6.9 II 10
## 6 76 여 171 87 2011.6.16 III 10
## # ... with 1 more variable: diseaseCode <chr>
# Show every column's type together with the attributes readr attached
# (the parsing-problem table and the column specification).
str(cancer)
## Classes 'tbl_df', 'tbl' and 'data.frame': 18310 obs. of 8 variables:
## $ age : num 75 52 67 62 70 76 55 72 64 71 ...
## $ sex : chr "남" "여" "여" "남" ...
## $ height : num 161 177 154 162 171 ...
## $ weight : num 64 75.3 65.6 57 65 87 77 55 67 55.5 ...
## $ dateOfoperation: chr "2011.6.22" "2011.5.19" "2011.5.31" "2011.6.21" ...
## $ cancerStaging : chr "I" "IV" "III" "I" ...
## $ hospitalization: num 48 17 10 11 10 10 12 18 15 35 ...
## $ diseaseCode : chr "C187" "C187" "C187" "C187" ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 278 obs. of 5 variables:
## ..$ row : int 1294 1294 1365 1430 1459 1468 1468 1521 1521 1565 ...
## ..$ col : chr "height" "weight" "height" "height" ...
## ..$ expected: chr "a double" "a double" "a double" "a double" ...
## ..$ actual : chr "기록없음" "기록없음" "기록없음" "기록없음" ...
## ..$ file : chr "'cancer.csv'" "'cancer.csv'" "'cancer.csv'" "'cancer.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_character(),
## .. height = col_double(),
## .. weight = col_double(),
## .. dateOfoperation = col_character(),
## .. cancerStaging = col_character(),
## .. hospitalization = col_double(),
## .. diseaseCode = col_character()
## .. )
# Number of rows and columns in the raw data.
dim(cancer)
## [1] 18310 8
Remove NA: drop every row that has a missing value in any column.
# Keep only the rows without any NA. The "기록없음" labels in cancerStaging are
# ordinary character values, not NA, so those rows are kept.
newd <- na.omit(cancer)
Basic statistics for the numeric variables: I checked the basic summary statistics of the numeric variables. There were some outliers in each variable, and the mean age was about 63.5 years.
# Five-number summary plus mean for each numeric column of the raw data.
# summary() counts NAs on its own and reports them in the "NA's" column; it
# has no na.rm argument, so none is passed.
summary(cancer$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 100.0 155.0 162.0 161.6 168.0 196.6 194
summary(cancer$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 26.00 54.00 61.00 61.48 68.40 118.00 84
summary(cancer$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 55.00 64.00 63.51 72.00 102.00
summary(cancer$hospitalization)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 10.00 13.00 16.83 19.00 354.00
# Box plot of age to look for outliers.
boxplot(cancer$age)
Check Standard Deviation
# Standard deviation of each numeric variable in the NA-free data.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sd(newd$height, na.rm = TRUE)
## [1] 8.906801
sd(newd$weight, na.rm = TRUE)
## [1] 10.74521
sd(newd$age, na.rm = TRUE)
## [1] 11.58417
sd(newd$hospitalization, na.rm = TRUE)
## [1] 11.83359
Check the data by sex. Male patients make up a somewhat larger share of the records than female patients (10,991 vs. 7,116).
# Work on a copy of the NA-free data.
simplestat <- newd

# Age statistics and patient count for each sex.
simplestat %>%
  group_by(sex) %>%
  summarise(
    mean_age = mean(age),
    sum_age = sum(age),
    median_age = median(age),
    n = n()
  )
## # A tibble: 2 x 5
## sex mean_age sum_age median_age n
## <chr> <dbl> <dbl> <dbl> <int>
## 1 남 63.2 694419 64 10991
## 2 여 64.0 455278 66 7116
Check the data by cancer staging. The table and bar plot below show the frequency of each stage; stage III has the highest frequency.
# Frequency of each cancer stage label.
staging <- table(newd[["cancerStaging"]])
print(staging)
##
## I II III IV 기록없음
## 4041 5176 6515 2146 229
# Bar chart of the stage counts; main title and x-axis label are added
# afterwards in a single title() call.
barplot(staging)
title(main = "Cancer Staging", xlab = "staging")
# Mean hospitalization within each stage.
hospital <- with(newd, tapply(hospitalization, cancerStaging, mean))
print(hospital)
## I II III IV 기록없음
## 14.71616 16.93083 16.57452 20.39795 20.73362
barplot(hospital, ylim = c(0, 50))
# Distribution of hospitalization per stage.
boxplot(newd$hospitalization ~ newd$cancerStaging)
Find correlations between variables. First I simply checked the correlation between a few pairs of variables and found that height and weight are moderately correlated (r ≈ 0.61), that age and hospitalization are only very weakly correlated (r ≈ 0.06), and that days of hospitalization and height have essentially no relation (r ≈ -0.02).
# Pairwise Pearson correlations between selected numeric variables.
with(newd, cor(height, weight))
## [1] 0.6146694
with(newd, cor(age, hospitalization))
## [1] 0.06249857
with(newd, cor(hospitalization, height))
## [1] -0.01643714
# Same pair as above with the arguments swapped; correlation is symmetric.
with(newd, cor(hospitalization, age))
## [1] 0.06249857
Based on the results above, the hypotheses below were designed. Correlation
Test 1. H0: ρ = 0 There is no relationship between weight and height.
H1: ρ ≠ 0 There is a relationship between weight and height. Pearson’s correlation was used. The correlation between weight and height was about r = 0.61, which is positive and moderate. The p-value (< 2.2e-16) is far below alpha = 0.05, so we can reject the null hypothesis; the 95% confidence interval (0.606 to 0.624) does not contain 0, and the t statistic (104.85) is far larger than the critical value.
Test 2. H0: ρ = 0 There is no relationship between age and hospitalization. H1: ρ ≠ 0 There is a relationship between age and hospitalization. Pearson’s correlation was used. The correlation between age and hospitalization was about r = 0.06, which is positive but very weak. The p-value (< 2.2e-16) is far below alpha = 0.05, so we can reject the null hypothesis; the 95% confidence interval (0.048 to 0.077) does not contain 0, and the t statistic (8.43) is larger than the critical value. With more than 18,000 observations, even such a weak correlation is statistically significant.
# Test 2: Pearson correlation test between hospitalization and age.
cor.test(newd$hospitalization, newd$age)
##
## Pearson's product-moment correlation
##
## data: newd$hospitalization and newd$age
## t = 8.426, df = 18105, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04797658 0.07699413
## sample estimates:
## cor
## 0.06249857
# Correlation matrix of age, height and weight. The columns are selected by
# name rather than by position so the result does not depend on column order.
cor(newd[, c("age", "height", "weight")])
## age height weight
## age 1.00000000 -0.01701435 -0.02763348
## height -0.01701435 1.00000000 0.61466938
## weight -0.02763348 0.61466938 1.00000000
# Test 1: Pearson correlation test between height and weight.
cor.test(newd$height, newd$weight)
##
## Pearson's product-moment correlation
##
## data: newd$height and newd$weight
## t = 104.85, df = 18105, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6055250 0.6236515
## sample estimates:
## cor
## 0.6146694
Covariance and variance: age and weight, and age and height, have negative covariances (about -3.44 and -1.76), i.e. very weak negative correlations.
# Variance of height.
var(newd$height, na.rm = TRUE)
## [1] 79.33111
# Called with two vectors, var() returns their covariance; the cov() call
# below computes the same quantity.
var(newd$age, newd$weight)
## [1] -3.439661
cov(newd$age, newd$weight, use = "complete.obs")
## [1] -3.439661
# Variance-covariance matrix of age, height and weight, with the columns
# selected by name rather than by position.
var(newd[, c("age", "height", "weight")], na.rm = TRUE)
## age height weight
## age 134.193098 -1.755505 -3.439661
## height -1.755505 79.331107 58.827217
## weight -3.439661 58.827217 115.459567
Plot: weight and height have a moderate positive linear relationship. Plot: age and hospitalization show only a very weak positive linear relationship (r ≈ 0.06).
# Scatter plot of height against weight.
DF2 <- data.frame(
  newd.height = newd$height,
  newd.weight = newd$weight
)
plot(DF2)

# Scatter plot of age against hospitalization.
DF5 <- data.frame(
  newd.age = newd$age,
  newd.hospitalization = newd$hospitalization
)
plot(DF5)
Describe the process that you followed in this project as well as what you learned from these data.
I first loaded the packages and looked through the data with the summary functions. After figuring out the structure and some numeric and categorical information about the data, I removed the NAs. Then, after reviewing the basic statistics of the numeric variables (mean, range, median, standard deviation), I decided what my hypotheses would be for the correlation analysis. With four numerically recorded variables that could be compared, there were several options. I used Pearson’s correlation, and the null hypothesis was rejected in both tests, which means the correlations were statistically significant. What I learned from the data is listed below:
1. Male patients made up a larger proportion of the cancer records than female patients.
2. The most frequent of the four cancer stages was stage III, though the gaps between stages I–III were not very large.
3. There were extreme outliers in hospitalization.
4. Weight and height for people who have cancer have a moderate positive linear correlation.
5. Age and hospitalization for people who have cancer have a weak positive linear relationship.