Import the library for reading and manipulating the data

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.1

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.1.2

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Objective for Analysis:

Relationship between gender and rate of smoking.
Relationship between education and rate of smoking.

Reading smoking.csv from GitHub

# Location of the raw survey CSV; kept in its own variable so the read call stays short.
smoking_csv_url <- "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/MQari_R_Bridge_Final_Project/smoking.csv"
smoking_data <- read_csv(file = smoking_csv_url)

## Warning: Missing column names filled in: 'X1' [1]

## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_double(),
##   gender = col_character(),
##   age = col_double(),
##   marital_status = col_character(),
##   highest_qualification = col_character(),
##   nationality = col_character(),
##   ethnicity = col_character(),
##   gross_income = col_character(),
##   region = col_character(),
##   smoke = col_character(),
##   amt_weekends = col_double(),
##   amt_weekdays = col_double(),
##   type = col_character()
## )

Data Exploration:

# Preview the first rows of the loaded data.
smoking_data %>% head()

## # A tibble: 6 x 13
##      X1 gender   age marital_status highest_qualification nationality ethnicity
##   <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
## 1     1 Male      38 Divorced       No Qualification      British     White    
## 2     2 Female    42 Single         No Qualification      British     White    
## 3     3 Male      40 Married        Degree                English     White    
## 4     4 Female    40 Married        Degree                English     White    
## 5     5 Female    39 Married        GCSE/O Level          British     White    
## 6     6 Female    37 Married        GCSE/O Level          British     White    
## # ... with 6 more variables: gross_income <chr>, region <chr>, smoke <chr>,
## #   amt_weekends <dbl>, amt_weekdays <dbl>, type <chr>

To calculate the number of rows

# Number of rows in the data.
smoking_data %>% nrow()

## [1] 1691

# Summarizing the dataset

# Per-column summary of the data.
smoking_data %>% summary()

##        X1            gender               age        marital_status    
##  Min.   :   1.0   Length:1691        Min.   :16.00   Length:1691       
##  1st Qu.: 423.5   Class :character   1st Qu.:34.00   Class :character  
##  Median : 846.0   Mode  :character   Median :48.00   Mode  :character  
##  Mean   : 846.0                      Mean   :49.84                     
##  3rd Qu.:1268.5                      3rd Qu.:65.50                     
##  Max.   :1691.0                      Max.   :97.00                     
##                                                                        
##  highest_qualification nationality         ethnicity         gross_income      
##  Length:1691           Length:1691        Length:1691        Length:1691       
##  Class :character      Class :character   Class :character   Class :character  
##  Mode  :character      Mode  :character   Mode  :character   Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##     region             smoke            amt_weekends    amt_weekdays  
##  Length:1691        Length:1691        Min.   : 0.00   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:10.00   1st Qu.: 7.00  
##  Mode  :character   Mode  :character   Median :15.00   Median :12.00  
##                                        Mean   :16.41   Mean   :13.75  
##                                        3rd Qu.:20.00   3rd Qu.:20.00  
##                                        Max.   :60.00   Max.   :55.00  
##                                        NA's   :1270    NA's   :1270   
##      type          
##  Length:1691       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Statistical Calculations:

Finding the first quartile, the median and the third quartile of age

# 25th percentile of age.
quantile(smoking_data[["age"]], probs = 0.25)

## 25% 
##  34

# 50th percentile of age.
quantile(smoking_data[["age"]], probs = 0.50)

## 50% 
##  48

# 75th percentile of age.
quantile(smoking_data[["age"]], probs = 0.75)

##  75% 
## 65.5

# Mean of the age column. Stored as `mean_age` rather than `mean` so the
# variable does not reuse the name of the base function mean().
mean_age <- mean(smoking_data$age)
mean_age

## [1] 49.83619

# Median of the age column. Stored as `median_age` rather than `median` so the
# variable does not reuse the name of the stats function median().
median_age <- median(smoking_data$age)
median_age

## [1] 48

Data Wrangling:

Select the rows using slice

# Rows 1 through 7, selected by position.
smoking_data %>% slice(1:7)

## # A tibble: 7 x 13
##      X1 gender   age marital_status highest_qualification nationality ethnicity
##   <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
## 1     1 Male      38 Divorced       No Qualification      British     White    
## 2     2 Female    42 Single         No Qualification      British     White    
## 3     3 Male      40 Married        Degree                English     White    
## 4     4 Female    40 Married        Degree                English     White    
## 5     5 Female    39 Married        GCSE/O Level          British     White    
## 6     6 Female    37 Married        GCSE/O Level          British     White    
## 7     7 Male      53 Married        Degree                British     White    
## # ... with 6 more variables: gross_income <chr>, region <chr>, smoke <chr>,
## #   amt_weekends <dbl>, amt_weekdays <dbl>, type <chr>

Select the columns

# Keep only the gender, age and smoke columns.
smoking_data %>% select(gender, age, smoke)

## # A tibble: 1,691 x 3
##    gender   age smoke
##    <chr>  <dbl> <chr>
##  1 Male      38 No   
##  2 Female    42 Yes  
##  3 Male      40 No   
##  4 Female    40 No   
##  5 Female    39 No   
##  6 Female    37 No   
##  7 Male      53 Yes  
##  8 Male      44 No   
##  9 Male      40 Yes  
## 10 Female    41 Yes  
## # ... with 1,681 more rows

Select rows 1, 7 and 4 (in that order) using slice

# Rows 1, 7 and 4, returned in the order given.
smoking_data %>% slice(c(1, 7, 4))

## # A tibble: 3 x 13
##      X1 gender   age marital_status highest_qualification nationality ethnicity
##   <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
## 1     1 Male      38 Divorced       No Qualification      British     White    
## 2     7 Male      53 Married        Degree                British     White    
## 3     4 Female    40 Married        Degree                English     White    
## # ... with 6 more variables: gross_income <chr>, region <chr>, smoke <chr>,
## #   amt_weekends <dbl>, amt_weekdays <dbl>, type <chr>

Subsetting the dataset by column

# Names of the columns used for the subsetting step that follows.
x <- c("age", "gender", "highest_qualification")
print(x)

## [1] "age"                   "gender"                "highest_qualification"

Subsetting the dataset by excluding the columns listed in ‘x’

# Drop every column whose name appears in x.
smoking_data %>% select(-all_of(x))

## # A tibble: 1,691 x 10
##       X1 marital_status nationality ethnicity gross_income     region    smoke
##    <dbl> <chr>          <chr>       <chr>     <chr>            <chr>     <chr>
##  1     1 Divorced       British     White     2,600 to 5,200   The North No   
##  2     2 Single         British     White     Under 2,600      The North Yes  
##  3     3 Married        English     White     28,600 to 36,400 The North No   
##  4     4 Married        English     White     10,400 to 15,600 The North No   
##  5     5 Married        British     White     2,600 to 5,200   The North No   
##  6     6 Married        British     White     15,600 to 20,800 The North No   
##  7     7 Married        British     White     Above 36,400     The North Yes  
##  8     8 Single         English     White     10,400 to 15,600 The North No   
##  9     9 Single         English     White     2,600 to 5,200   The North Yes  
## 10    10 Married        English     White     5,200 to 10,400  The North Yes  
## # ... with 1,681 more rows, and 3 more variables: amt_weekends <dbl>,
## #   amt_weekdays <dbl>, type <chr>

Filter data using filter

# Rows where marital_status is exactly "Married".
smoking_data %>% filter(marital_status == "Married")

## # A tibble: 812 x 13
##       X1 gender   age marital_status highest_qualification nationality ethnicity
##    <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
##  1     3 Male      40 Married        Degree                English     White    
##  2     4 Female    40 Married        Degree                English     White    
##  3     5 Female    39 Married        GCSE/O Level          British     White    
##  4     6 Female    37 Married        GCSE/O Level          British     White    
##  5     7 Male      53 Married        Degree                British     White    
##  6    10 Female    41 Married        No Qualification      English     White    
##  7    12 Male      49 Married        No Qualification      British     White    
##  8    13 Male      29 Married        Degree                English     White    
##  9    20 Male      55 Married        No Qualification      English     White    
## 10    21 Female    34 Married        GCSE/CSE              British     White    
## # ... with 802 more rows, and 6 more variables: gross_income <chr>,
## #   region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## #   type <chr>

Filter data by multiple criteria

# Male smokers older than 20; comma-separated conditions in filter() are combined with AND.
smoking_data %>% filter(gender == "Male", smoke == "Yes", age > 20)

## # A tibble: 175 x 13
##       X1 gender   age marital_status highest_qualification nationality ethnicity
##    <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
##  1     7 Male      53 Married        Degree                British     White    
##  2     9 Male      40 Single         GCSE/CSE              English     White    
##  3    50 Male      59 Married        Other/Sub Degree      English     White    
##  4    55 Male      28 Married        Other/Sub Degree      British     White    
##  5    72 Male      67 Widowed        No Qualification      British     White    
##  6    75 Male      40 Married        Other/Sub Degree      English     White    
##  7    78 Male      43 Divorced       Other/Sub Degree      English     White    
##  8    86 Male      27 Single         ONC/BTEC              British     White    
##  9   107 Male      57 Separated      No Qualification      Welsh       White    
## 10   118 Male      71 Widowed        No Qualification      English     White    
## # ... with 165 more rows, and 6 more variables: gross_income <chr>,
## #   region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## #   type <chr>

Creating a new column based on other column and assigning name to it

# Add smoke_amt_total as the sum of the weekend and weekday amounts.
# Rows where either amount is NA get NA for the total.
smoking_data <- smoking_data %>%
  mutate(smoke_amt_total = amt_weekends + amt_weekdays)
smoking_data

## # A tibble: 1,691 x 14
##       X1 gender   age marital_status highest_qualification nationality ethnicity
##    <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
##  1     1 Male      38 Divorced       No Qualification      British     White    
##  2     2 Female    42 Single         No Qualification      British     White    
##  3     3 Male      40 Married        Degree                English     White    
##  4     4 Female    40 Married        Degree                English     White    
##  5     5 Female    39 Married        GCSE/O Level          British     White    
##  6     6 Female    37 Married        GCSE/O Level          British     White    
##  7     7 Male      53 Married        Degree                British     White    
##  8     8 Male      44 Single         Degree                English     White    
##  9     9 Male      40 Single         GCSE/CSE              English     White    
## 10    10 Female    41 Married        No Qualification      English     White    
## # ... with 1,681 more rows, and 7 more variables: gross_income <chr>,
## #   region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## #   type <chr>, smoke_amt_total <dbl>

Display mean of a column using summarize

# One-row summary holding the mean of age.
smoking_data %>% summarise(mean(age))

## # A tibble: 1 x 1
##   `mean(age)`
##         <dbl>
## 1        49.8

Group smoking_data by gender using group_by and then summarize

# Group the rows by gender, then report the mean and median age of each group.
by_gender <- smoking_data %>% group_by(gender)
by_gender %>% summarise(mean(age), median(age))

## # A tibble: 2 x 3
##   gender `mean(age)` `median(age)`
##   <chr>        <dbl>         <dbl>
## 1 Female        50.3            48
## 2 Male          49.2            48

Group smoking_data by gender and by smoke using group_by and then summarize

# Group the rows by gender and smoking status, then report the mean and
# median age of each combination.
by_gender_smoke <- smoking_data %>% group_by(gender, smoke)
by_gender_smoke %>% summarise(mean(age), median(age))

## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.

## # A tibble: 4 x 4
## # Groups:   gender [2]
##   gender smoke `mean(age)` `median(age)`
##   <chr>  <chr>       <dbl>         <dbl>
## 1 Female No           53.0            54
## 2 Female Yes          42.2            40
## 3 Male   No           51.2            51
## 4 Male   Yes          43.4            41

Count the number of rows in smoking_data per gender

# Number of rows for each gender value.
smoking_data %>% count(gender)

## # A tibble: 2 x 2
##   gender     n
##   <chr>  <int>
## 1 Female   965
## 2 Male     726

Previewing the data before splitting the values in a column

# First six rows, now including the smoke_amt_total column.
head(smoking_data, n = 6)

## # A tibble: 6 x 14
##      X1 gender   age marital_status highest_qualification nationality ethnicity
##   <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
## 1     1 Male      38 Divorced       No Qualification      British     White    
## 2     2 Female    42 Single         No Qualification      British     White    
## 3     3 Male      40 Married        Degree                English     White    
## 4     4 Female    40 Married        Degree                English     White    
## 5     5 Female    39 Married        GCSE/O Level          British     White    
## 6     6 Female    37 Married        GCSE/O Level          British     White    
## # ... with 7 more variables: gross_income <chr>, region <chr>, smoke <chr>,
## #   amt_weekends <dbl>, amt_weekdays <dbl>, type <chr>, smoke_amt_total <dbl>

Fetching unique values from a column

# Distinct values of the highest_qualification column.
education_category <- unique(smoking_data$highest_qualification)

# Report how many distinct values there are. The count (a single number) is
# pasted into the message; pasting the vector itself would repeat the message
# once per category instead of stating the number.
paste(
  "The number of unique categories for education is : ",
  length(education_category)
)

## [1] "The number of unique categories for education is :  8"

# How many distinct education categories were found.
education_category %>% length()

## [1] 8

To check the datatype of each column

# Show the type and first values of every column.
smoking_data %>% str()

## spec_tbl_df [1,691 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ X1                   : num [1:1691] 1 2 3 4 5 6 7 8 9 10 ...
##  $ gender               : chr [1:1691] "Male" "Female" "Male" "Female" ...
##  $ age                  : num [1:1691] 38 42 40 40 39 37 53 44 40 41 ...
##  $ marital_status       : chr [1:1691] "Divorced" "Single" "Married" "Married" ...
##  $ highest_qualification: chr [1:1691] "No Qualification" "No Qualification" "Degree" "Degree" ...
##  $ nationality          : chr [1:1691] "British" "British" "English" "English" ...
##  $ ethnicity            : chr [1:1691] "White" "White" "White" "White" ...
##  $ gross_income         : chr [1:1691] "2,600 to 5,200" "Under 2,600" "28,600 to 36,400" "10,400 to 15,600" ...
##  $ region               : chr [1:1691] "The North" "The North" "The North" "The North" ...
##  $ smoke                : chr [1:1691] "No" "Yes" "No" "No" ...
##  $ amt_weekends         : num [1:1691] NA 12 NA NA NA NA 6 NA 8 15 ...
##  $ amt_weekdays         : num [1:1691] NA 12 NA NA NA NA 6 NA 8 12 ...
##  $ type                 : chr [1:1691] NA "Packets" NA NA ...
##  $ smoke_amt_total      : num [1:1691] NA 24 NA NA NA NA 12 NA 16 27 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   gender = col_character(),
##   ..   age = col_double(),
##   ..   marital_status = col_character(),
##   ..   highest_qualification = col_character(),
##   ..   nationality = col_character(),
##   ..   ethnicity = col_character(),
##   ..   gross_income = col_character(),
##   ..   region = col_character(),
##   ..   smoke = col_character(),
##   ..   amt_weekends = col_double(),
##   ..   amt_weekdays = col_double(),
##   ..   type = col_character()
##   .. )

To split a column in a dataset and assigning new column names

# Split gross_income at its first space into exactly two pieces
# (e.g. "2,600 to 5,200" gives "2,600" and "to 5,200"), then store the pieces
# as the lower-range and upper-range columns.
income_parts <- str_split_fixed(smoking_data$gross_income, pattern = " ", n = 2)
smoking_data$gross_income_lower_range <- income_parts[, 1]
smoking_data$gross_income_upper_range <- income_parts[, 2]

smoking_data %>% head()

## # A tibble: 6 x 16
##      X1 gender   age marital_status highest_qualification nationality ethnicity
##   <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
## 1     1 Male      38 Divorced       No Qualification      British     White    
## 2     2 Female    42 Single         No Qualification      British     White    
## 3     3 Male      40 Married        Degree                English     White    
## 4     4 Female    40 Married        Degree                English     White    
## 5     5 Female    39 Married        GCSE/O Level          British     White    
## 6     6 Female    37 Married        GCSE/O Level          British     White    
## # ... with 9 more variables: gross_income <chr>, region <chr>, smoke <chr>,
## #   amt_weekends <dbl>, amt_weekdays <dbl>, type <chr>, smoke_amt_total <dbl>,
## #   gross_income_lower_range <chr>, gross_income_upper_range <chr>

Now removing the unwanted values from the split columns

# Remove every "to" from the upper-range text left over from the split.
# NOTE(review): only the letters "to" are removed, so a value such as
# "to 5,200" keeps its leading space (" 5,200") - confirm this is intended
# before trimming, since the plots below use this column as a category.
smoking_data <- smoking_data %>%
  mutate(gross_income_upper_range = str_replace_all(gross_income_upper_range, "to", ""))
smoking_data

## # A tibble: 1,691 x 16
##       X1 gender   age marital_status highest_qualification nationality ethnicity
##    <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
##  1     1 Male      38 Divorced       No Qualification      British     White    
##  2     2 Female    42 Single         No Qualification      British     White    
##  3     3 Male      40 Married        Degree                English     White    
##  4     4 Female    40 Married        Degree                English     White    
##  5     5 Female    39 Married        GCSE/O Level          British     White    
##  6     6 Female    37 Married        GCSE/O Level          British     White    
##  7     7 Male      53 Married        Degree                British     White    
##  8     8 Male      44 Single         Degree                English     White    
##  9     9 Male      40 Single         GCSE/CSE              English     White    
## 10    10 Female    41 Married        No Qualification      English     White    
## # ... with 1,681 more rows, and 9 more variables: gross_income <chr>,
## #   region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## #   type <chr>, smoke_amt_total <dbl>, gross_income_lower_range <chr>,
## #   gross_income_upper_range <chr>

Replacing the values in the new column

# Replace the first "Under" in the lower-range text with the placeholder "1,000".
# NOTE(review): "1,000" appears to be a stand-in lower bound chosen by the
# author - confirm the intended value.
smoking_data <- smoking_data %>%
  mutate(gross_income_lower_range = str_replace(gross_income_lower_range, "Under", "1,000"))
smoking_data

## # A tibble: 1,691 x 16
##       X1 gender   age marital_status highest_qualification nationality ethnicity
##    <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
##  1     1 Male      38 Divorced       No Qualification      British     White    
##  2     2 Female    42 Single         No Qualification      British     White    
##  3     3 Male      40 Married        Degree                English     White    
##  4     4 Female    40 Married        Degree                English     White    
##  5     5 Female    39 Married        GCSE/O Level          British     White    
##  6     6 Female    37 Married        GCSE/O Level          British     White    
##  7     7 Male      53 Married        Degree                British     White    
##  8     8 Male      44 Single         Degree                English     White    
##  9     9 Male      40 Single         GCSE/CSE              English     White    
## 10    10 Female    41 Married        No Qualification      English     White    
## # ... with 1,681 more rows, and 9 more variables: gross_income <chr>,
## #   region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## #   type <chr>, smoke_amt_total <dbl>, gross_income_lower_range <chr>,
## #   gross_income_upper_range <chr>

# Replace the first "Above" in the lower-range text with the placeholder "5,000".
# NOTE(review): for the "Above 36,400" band a lower bound of "5,000" looks
# inconsistent with the band itself - confirm the intended value.
smoking_data <- smoking_data %>%
  mutate(gross_income_lower_range = str_replace(gross_income_lower_range, "Above", "5,000"))
smoking_data

## # A tibble: 1,691 x 16
##       X1 gender   age marital_status highest_qualification nationality ethnicity
##    <dbl> <chr>  <dbl> <chr>          <chr>                 <chr>       <chr>    
##  1     1 Male      38 Divorced       No Qualification      British     White    
##  2     2 Female    42 Single         No Qualification      British     White    
##  3     3 Male      40 Married        Degree                English     White    
##  4     4 Female    40 Married        Degree                English     White    
##  5     5 Female    39 Married        GCSE/O Level          British     White    
##  6     6 Female    37 Married        GCSE/O Level          British     White    
##  7     7 Male      53 Married        Degree                British     White    
##  8     8 Male      44 Single         Degree                English     White    
##  9     9 Male      40 Single         GCSE/CSE              English     White    
## 10    10 Female    41 Married        No Qualification      English     White    
## # ... with 1,681 more rows, and 9 more variables: gross_income <chr>,
## #   region <chr>, smoke <chr>, amt_weekends <dbl>, amt_weekdays <dbl>,
## #   type <chr>, smoke_amt_total <dbl>, gross_income_lower_range <chr>,
## #   gross_income_upper_range <chr>

Graphical Analysis:

library(ggplot2)
# Scatter of smoke_amt_total (weekend amount + weekday amount) for each gender,
# coloured by smoking status. The y-axis is the combined total, so the title
# describes the total rather than weekend-only smoking.
my_graph <- ggplot(smoking_data, aes(x = gender, y = smoke_amt_total)) +
  geom_point(aes(color = factor(smoke)))
my_graph +
  labs(title = "Total Amount of Smoking (Weekends and Weekdays) per Gender")

## Warning: Removed 1270 rows containing missing values (geom_point).

# Scatter of log(total amount smoked) against log(age), coloured by smoking
# status, with one fitted straight line drawn without a confidence band.
new_graph <- smoking_data %>%
  ggplot(aes(x = log(age), y = log(smoke_amt_total))) +
  geom_point(aes(color = factor(smoke))) +
  stat_smooth(method = "lm", colour = "#C42126", se = FALSE, size = 1)
new_graph

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 1271 rows containing non-finite values (stat_smooth).

## Warning: Removed 1270 rows containing missing values (geom_point).

# Bar chart of smoke_amt_total for each upper income bound, drawn from the raw
# values (stat = "identity") with dodged positioning.
# No fill aesthetic is mapped in this plot, so a fill scale has nothing to
# apply to; the unused scale_fill_brewer() call has been removed.
ggplot(smoking_data, aes(x = gross_income_upper_range, y = smoke_amt_total)) +
  geom_bar(stat = "identity", position = "dodge")

## Warning: Removed 1270 rows containing missing values (geom_bar).

# Stacked bars of smoke_amt_total per upper income bound, filled by that same bound.
perf <- smoking_data %>%
  ggplot(aes(
    x = gross_income_upper_range,
    y = smoke_amt_total,
    fill = gross_income_upper_range
  )) +
  geom_bar(stat = "identity")
print(perf)

## Warning: Removed 1270 rows containing missing values (position_stack).

# Stacked bars of smoke_amt_total per education level, filled by that same level.
perf <- smoking_data %>%
  ggplot(aes(
    x = highest_qualification,
    y = smoke_amt_total,
    fill = highest_qualification
  )) +
  geom_bar(stat = "identity")
print(perf)

## Warning: Removed 1270 rows containing missing values (position_stack).

# Box plot of smoke_amt_total per upper income bound; the white star
# (shape 8) marks each group's mean.
smoking_data %>%
  ggplot(aes(
    x = gross_income_upper_range,
    y = smoke_amt_total,
    fill = gross_income_upper_range
  )) +
  geom_boxplot() +
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 8,
    size = 2,
    color = "white"
  )

## Warning: Removed 1270 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1270 rows containing non-finite values (stat_summary).

# Brief Conclusion: * The combined weekend and weekday amount of smoking by males is slightly higher than that of females. * There is a consistent slight increase in the amount of smoking with age. * The group with a gross income of around $10,000 tends to smoke the most compared to the others. * The ‘no qualification’ category is the largest group of smokers. * The spread of the data is greatest in the highest gross income class, and since the mean is greater than the median, the data is right-skewed.

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Example chunk from the R Markdown template: summary of the built-in cars data.
summary(object = cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Final Project