library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the smoking survey CSV directly from GitHub. The file's first column has
# no header, so readr names it `X1`.
smoking_data <- read_csv(
  file = "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/MQari_R_Bridge_Final_Project/smoking.csv"
)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_double(),
## gender = col_character(),
## age = col_double(),
## marital_status = col_character(),
## highest_qualification = col_character(),
## nationality = col_character(),
## ethnicity = col_character(),
## gross_income = col_character(),
## region = col_character(),
## smoke = col_character(),
## amt_weekends = col_double(),
## amt_weekdays = col_double(),
## type = col_character()
## )
# Preview the first six rows of the freshly loaded data.
smoking_data %>% head(n = 6)
## # A tibble: 6 x 13
## X1 gender age marital_status highest_qualification nationality ethnicity
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 1 Male 38 Divorced No Qualification British White
## 2 2 Female 42 Single No Qualification British White
## 3 3 Male 40 Married Degree English White
## 4 4 Female 40 Married Degree English White
## 5 5 Female 39 Married GCSE/O Level British White
## 6 6 Female 37 Married GCSE/O Level British White
## # ... with 6 more variables: gross_income <chr>, region <chr>, smoke <chr>,
## # amt_weekends <dbl>, amt_weekdays <dbl>, type <chr>
# Number of rows (first element of the dimensions).
dim(smoking_data)[1]
## [1] 1691
# Per-column summary: five-number summary and mean for numeric columns,
# length/class/mode for character columns.
smoking_data %>% summary()
## X1 gender age marital_status
## Min. : 1.0 Length:1691 Min. :16.00 Length:1691
## 1st Qu.: 423.5 Class :character 1st Qu.:34.00 Class :character
## Median : 846.0 Mode :character Median :48.00 Mode :character
## Mean : 846.0 Mean :49.84
## 3rd Qu.:1268.5 3rd Qu.:65.50
## Max. :1691.0 Max. :97.00
##
## highest_qualification nationality ethnicity gross_income
## Length:1691 Length:1691 Length:1691 Length:1691
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## region smoke amt_weekends amt_weekdays
## Length:1691 Length:1691 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.:10.00 1st Qu.: 7.00
## Mode :character Mode :character Median :15.00 Median :12.00
## Mean :16.41 Mean :13.75
## 3rd Qu.:20.00 3rd Qu.:20.00
## Max. :60.00 Max. :55.00
## NA's :1270 NA's :1270
## type
## Length:1691
## Class :character
## Mode :character
##
##
##
##
# Quartiles of respondent age.
quantile(smoking_data$age, 0.25)
## 25%
## 34
quantile(smoking_data$age, 0.50)
## 50%
## 48
quantile(smoking_data$age, 0.75)
## 75%
## 65.5
# Mean and median age. The results are stored under descriptive names so the
# base functions mean() and median() are not shadowed by same-named variables.
age_mean <- mean(smoking_data$age)
age_mean
## [1] 49.83619
age_median <- median(smoking_data$age)
age_median
## [1] 48
# First seven rows, selected by position.
smoking_data %>% slice(seq_len(7))
## # A tibble: 7 x 13
## X1 gender age marital_status highest_qualification nationality ethnicity
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 1 Male 38 Divorced No Qualification British White
## 2 2 Female 42 Single No Qualification British White
## 3 3 Male 40 Married Degree English White
## 4 4 Female 40 Married Degree English White
## 5 5 Female 39 Married GCSE/O Level British White
## 6 6 Female 37 Married GCSE/O Level British White
## 7 7 Male 53 Married Degree British White
## # ... with 6 more variables: gross_income <chr>, region <chr>, smoke <chr>,
## # amt_weekends <dbl>, amt_weekdays <dbl>, type <chr>
# Keep only the gender, age and smoke columns.
smoking_data %>% select(gender, age, smoke)
## # A tibble: 1,691 x 3
## gender age smoke
## <chr> <dbl> <chr>
## 1 Male 38 No
## 2 Female 42 Yes
## 3 Male 40 No
## 4 Female 40 No
## 5 Female 39 No
## 6 Female 37 No
## 7 Male 53 Yes
## 8 Male 44 No
## 9 Male 40 Yes
## 10 Female 41 Yes
## # ... with 1,681 more rows
Select rows 1, 7, and 4 (in that order) using slice
# Pick rows 1, 7 and 4; the result keeps the order given here.
smoking_data %>% slice(c(1, 7, 4))
## # A tibble: 3 x 13
## X1 gender age marital_status highest_qualification nationality ethnicity
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 1 Male 38 Divorced No Qualification British White
## 2 7 Male 53 Married Degree British White
## 3 4 Female 40 Married Degree English White
## # ... with 6 more variables: gross_income <chr>, region <chr>, smoke <chr>,
## # amt_weekends <dbl>, amt_weekdays <dbl>, type <chr>
# Names of the columns to leave out of the listing below.
x <- c("age", "gender", "highest_qualification")
x
## [1] "age" "gender" "highest_qualification"
# Select every column except those named in `x`.
smoking_data %>% select(!all_of(x))
## # A tibble: 1,691 x 10
## X1 marital_status nationality ethnicity gross_income region smoke
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 Divorced British White 2,600 to 5,200 The North No
## 2 2 Single British White Under 2,600 The North Yes
## 3 3 Married English White 28,600 to 36,400 The North No
## 4 4 Married English White 10,400 to 15,600 The North No
## 5 5 Married British White 2,600 to 5,200 The North No
## 6 6 Married British White 15,600 to 20,800 The North No
## 7 7 Married British White Above 36,400 The North Yes
## 8 8 Single English White 10,400 to 15,600 The North No
## 9 9 Single English White 2,600 to 5,200 The North Yes
## 10 10 Married English White 5,200 to 10,400 The North Yes
## # ... with 1,681 more rows, and 3 more variables: amt_weekends <dbl>,
## # amt_weekdays <dbl>, type <chr>
# Rows for married respondents only.
smoking_data %>% filter(marital_status == "Married")
## # A tibble: 812 x 13
## X1 gender age marital_status highest_qualification nationality ethnicity
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 3 Male 40 Married Degree English White
## 2 4 Female 40 Married Degree English White
## 3 5 Female 39 Married GCSE/O Level British White
## 4 6 Female 37 Married GCSE/O Level British White
## 5 7 Male 53 Married Degree British White
## 6 10 Female 41 Married No Qualification English White
## 7 12 Male 49 Married No Qualification British White
## 8 13 Male 29 Married Degree English White
## 9 20 Male 55 Married No Qualification English White
## 10 21 Female 34 Married GCSE/CSE British White
## # ... with 802 more rows, and 6 more variables: gross_income <chr>,
## # region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## # type <chr>
# Male smokers older than 20; comma-separated conditions are combined with AND.
smoking_data %>% filter(gender == "Male", smoke == "Yes", age > 20)
## # A tibble: 175 x 13
## X1 gender age marital_status highest_qualification nationality ethnicity
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 7 Male 53 Married Degree British White
## 2 9 Male 40 Single GCSE/CSE English White
## 3 50 Male 59 Married Other/Sub Degree English White
## 4 55 Male 28 Married Other/Sub Degree British White
## 5 72 Male 67 Widowed No Qualification British White
## 6 75 Male 40 Married Other/Sub Degree English White
## 7 78 Male 43 Divorced Other/Sub Degree English White
## 8 86 Male 27 Single ONC/BTEC British White
## 9 107 Male 57 Separated No Qualification Welsh White
## 10 118 Male 71 Widowed No Qualification English White
## # ... with 165 more rows, and 6 more variables: gross_income <chr>,
## # region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## # type <chr>
# Add smoke_amt_total = weekend amount + weekday amount. Rows where either
# amount is NA get NA for the total.
smoking_data <- smoking_data %>%
  mutate(smoke_amt_total = amt_weekends + amt_weekdays)
smoking_data
## # A tibble: 1,691 x 14
## X1 gender age marital_status highest_qualification nationality ethnicity
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 1 Male 38 Divorced No Qualification British White
## 2 2 Female 42 Single No Qualification British White
## 3 3 Male 40 Married Degree English White
## 4 4 Female 40 Married Degree English White
## 5 5 Female 39 Married GCSE/O Level British White
## 6 6 Female 37 Married GCSE/O Level British White
## 7 7 Male 53 Married Degree British White
## 8 8 Male 44 Single Degree English White
## 9 9 Male 40 Single GCSE/CSE English White
## 10 10 Female 41 Married No Qualification English White
## # ... with 1,681 more rows, and 7 more variables: gross_income <chr>,
## # region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## # type <chr>, smoke_amt_total <dbl>
# Overall mean age as a one-row tibble.
smoking_data %>% summarize(mean(age))
## # A tibble: 1 x 1
## `mean(age)`
## <dbl>
## 1 49.8
# Mean and median age per gender. The grouped data is piped straight into
# summarize() instead of being stored in a variable named like the column.
smoking_data %>%
  group_by(gender) %>%
  summarize(mean(age), median(age))
## # A tibble: 2 x 3
## gender `mean(age)` `median(age)`
## <chr> <dbl> <dbl>
## 1 Female 50.3 48
## 2 Male 49.2 48
# Mean and median age per gender/smoke combination. `.groups = "drop_last"`
# states the default grouping of the result explicitly (still grouped by
# gender), which also avoids the informational message from summarize().
smoking_data %>%
  group_by(gender, smoke) %>%
  summarize(mean(age), median(age), .groups = "drop_last")
## # A tibble: 4 x 4
## # Groups: gender [2]
## gender smoke `mean(age)` `median(age)`
## <chr> <chr> <dbl> <dbl>
## 1 Female No 53.0 54
## 2 Female Yes 42.2 40
## 3 Male No 51.2 51
## 4 Male Yes 43.4 41
# Number of respondents per gender.
smoking_data %>% count(gender)
## # A tibble: 2 x 2
## gender n
## <chr> <int>
## 1 Female 965
## 2 Male 726
# Preview the data again, now including the smoke_amt_total column.
smoking_data %>% head()
## # A tibble: 6 x 14
## X1 gender age marital_status highest_qualification nationality ethnicity
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 1 Male 38 Divorced No Qualification British White
## 2 2 Female 42 Single No Qualification British White
## 3 3 Male 40 Married Degree English White
## 4 4 Female 40 Married Degree English White
## 5 5 Female 39 Married GCSE/O Level British White
## 6 6 Female 37 Married GCSE/O Level British White
## # ... with 7 more variables: gross_income <chr>, region <chr>, smoke <chr>,
## # amt_weekends <dbl>, amt_weekdays <dbl>, type <chr>, smoke_amt_total <dbl>
# Distinct values of highest_qualification, in order of first appearance.
education_category <- unique(smoking_data$highest_qualification)
education_category
## [1] "No Qualification" "Degree" "GCSE/O Level"
## [4] "GCSE/CSE" "Other/Sub Degree" "Higher/Sub Degree"
## [7] "ONC/BTEC" "A Levels"
# Report how many distinct categories there are. The count (not the category
# vector) is pasted in, so the sentence is produced once instead of once per
# category.
paste("The number of unique categories for education is : ", length(education_category))
## [1] "The number of unique categories for education is : 8"
length(education_category)
## [1] 8
# Compact structure listing: column types plus the readr column specification.
smoking_data %>% str()
## spec_tbl_df [1,691 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ X1 : num [1:1691] 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : chr [1:1691] "Male" "Female" "Male" "Female" ...
## $ age : num [1:1691] 38 42 40 40 39 37 53 44 40 41 ...
## $ marital_status : chr [1:1691] "Divorced" "Single" "Married" "Married" ...
## $ highest_qualification: chr [1:1691] "No Qualification" "No Qualification" "Degree" "Degree" ...
## $ nationality : chr [1:1691] "British" "British" "English" "English" ...
## $ ethnicity : chr [1:1691] "White" "White" "White" "White" ...
## $ gross_income : chr [1:1691] "2,600 to 5,200" "Under 2,600" "28,600 to 36,400" "10,400 to 15,600" ...
## $ region : chr [1:1691] "The North" "The North" "The North" "The North" ...
## $ smoke : chr [1:1691] "No" "Yes" "No" "No" ...
## $ amt_weekends : num [1:1691] NA 12 NA NA NA NA 6 NA 8 15 ...
## $ amt_weekdays : num [1:1691] NA 12 NA NA NA NA 6 NA 8 12 ...
## $ type : chr [1:1691] NA "Packets" NA NA ...
## $ smoke_amt_total : num [1:1691] NA 24 NA NA NA NA 12 NA 16 27 ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. gender = col_character(),
## .. age = col_double(),
## .. marital_status = col_character(),
## .. highest_qualification = col_character(),
## .. nationality = col_character(),
## .. ethnicity = col_character(),
## .. gross_income = col_character(),
## .. region = col_character(),
## .. smoke = col_character(),
## .. amt_weekends = col_double(),
## .. amt_weekdays = col_double(),
## .. type = col_character()
## .. )
# Derive character lower/upper income bounds from the gross_income band label.
# Label shapes handled:
#   "<low> to <high>" -> lower = <low>, upper = <high> (no stray whitespace)
#   "Under <high>"    -> lower = "0",   upper = <high>
#   "Above <low>"     -> lower = <low>, upper = the full label, so the
#                        open-ended band stays a separate category from the
#                        band that ends at the same figure
# Any other label (one without a numeric bound) gets NA in both columns.
# Both columns stay character because the plots further down use
# gross_income_upper_range as a categorical axis.
smoking_data <- smoking_data %>%
  mutate(
    gross_income_lower_range = case_when(
      str_detect(gross_income, "^Under ") ~ "0",
      str_detect(gross_income, "^Above ") ~ str_remove(gross_income, "^Above "),
      str_detect(gross_income, " to ") ~ str_remove(gross_income, " to .*$"),
      TRUE ~ NA_character_
    ),
    gross_income_upper_range = case_when(
      str_detect(gross_income, "^Under ") ~ str_remove(gross_income, "^Under "),
      str_detect(gross_income, "^Above ") ~ gross_income,
      str_detect(gross_income, " to ") ~ str_remove(gross_income, "^.* to "),
      TRUE ~ NA_character_
    )
  )
smoking_data
## # A tibble: 1,691 x 16
## X1 gender age marital_status highest_qualification nationality ethnicity
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 1 Male 38 Divorced No Qualification British White
## 2 2 Female 42 Single No Qualification British White
## 3 3 Male 40 Married Degree English White
## 4 4 Female 40 Married Degree English White
## 5 5 Female 39 Married GCSE/O Level British White
## 6 6 Female 37 Married GCSE/O Level British White
## 7 7 Male 53 Married Degree British White
## 8 8 Male 44 Single Degree English White
## 9 9 Male 40 Single GCSE/CSE English White
## 10 10 Female 41 Married No Qualification English White
## # ... with 1,681 more rows, and 9 more variables: gross_income <chr>,
## # region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## # type <chr>, smoke_amt_total <dbl>, gross_income_lower_range <chr>,
## # gross_income_upper_range <chr>
library(ggplot2)
# Scatter of smoke_amt_total (amt_weekends + amt_weekdays) against gender,
# coloured by smoking status. The title names the combined total, since the
# y variable is not the weekend amount alone.
my_graph <- ggplot(smoking_data, aes(x = gender, y = smoke_amt_total)) +
  geom_point(aes(color = factor(smoke)))
my_graph +
  labs(
    title = "Total Amount of Smoking (Weekends + Weekdays) per Gender"
  )
## Warning: Removed 1270 rows containing missing values (geom_point).
# Log-log scatter of total smoking amount against age with a fitted straight
# line (no confidence band).
# NOTE(review): a zero total gives log(0) = -Inf, which ggplot drops as
# non-finite — presumably why the smoother removes one row more than the points.
new_graph <- ggplot(
  smoking_data,
  aes(x = log(age), y = log(smoke_amt_total))
) +
  geom_point(aes(color = factor(smoke))) +
  stat_smooth(
    method = "lm",
    colour = "#C42126",
    se = FALSE,
    size = 1
  )
new_graph
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1271 rows containing non-finite values (stat_smooth).
## Warning: Removed 1270 rows containing missing values (geom_point).
# Bars of smoke_amt_total per upper income bound. No `fill` aesthetic is
# mapped here, so the former scale_fill_brewer() call had no effect and has
# been removed; geom_col() is the direct form of geom_bar(stat = "identity").
# NOTE(review): with position = "dodge" and no grouping variable, bars that
# share an x value are drawn on top of each other — confirm whether a
# per-category total (stacked bars) was intended instead.
ggplot(smoking_data, aes(x = gross_income_upper_range, y = smoke_amt_total)) +
  geom_col(position = "dodge")
## Warning: Removed 1270 rows containing missing values (geom_bar).
# Stacked bars of smoke_amt_total per upper income bound, one fill colour per
# bound.
perf <- ggplot(
  data = smoking_data,
  aes(
    x = gross_income_upper_range,
    y = smoke_amt_total,
    fill = gross_income_upper_range
  )
) +
  geom_col()
perf
## Warning: Removed 1270 rows containing missing values (position_stack).
# Stacked bars of smoke_amt_total per education category, one fill colour per
# category.
perf <- ggplot(
  data = smoking_data,
  aes(
    x = highest_qualification,
    y = smoke_amt_total,
    fill = highest_qualification
  )
) +
  geom_col()
perf
## Warning: Removed 1270 rows containing missing values (position_stack).
# Box plot of smoke_amt_total per upper income bound; the white star (shape 8)
# marks each group's mean on top of the box.
ggplot(
  smoking_data,
  aes(
    x = gross_income_upper_range,
    y = smoke_amt_total,
    fill = gross_income_upper_range
  )
) +
  geom_boxplot() +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 8,
    size = 2,
    color = "white"
  )
## Warning: Removed 1270 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1270 rows containing non-finite values (stat_summary).
# Brief Conclusion: * The amount of smoking on the weekends by males is slightly higher than by females. * There is a consistent slight increase in the amount of smoking with age. * The group with a gross income of around $10,000 tends to smoke the most compared to the others. * The ‘no qualification’ category is the largest group of smokers. * The spread of the data is highest in the highest gross income class, and since the mean is greater than the median, the data is right-skewed.
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# R Markdown template example: summary of the built-in `cars` data set.
summary(object = cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.