Project 2

By Catherine Cho

Subproject 1: Alan Noel’s post

What means of control is most used now?

library(readr)
urlfile<-"https://raw.githubusercontent.com/alnoel/CUNYSPS-Data607/main/globaldataset_20200414.csv"
global<-read_csv(url(urlfile))

## Rows: 48801 Columns: 64

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): Datasource, gender, ageBroad, majorityStatus, majorityStatusAtExpl...
## dbl (50): yearOfRegistration, meansOfControlDebtBondage, meansOfControlTakes...
## lgl  (1): By using this data you agree to the Terms of Use: https://www.ctda...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
#subsetting the data to contain only the variables related to means of control. The 
control<-global[grepl("meansOfControl",names(global))]
control<-control[-18:-19]
#dataframe "tally" is created to tally up incidents per means of control
tally<-control%>%
  gather(x,value,meansOfControlDebtBondage:meansOfControlOther)%>%
  group_by(x)%>%
  tally(value==1)
#finding location in dataframe "tally" of max incidents
loc<-which(tally==max(tally[,2]),arr.ind = TRUE)
#answer to question 1: Most used means of control
control_max_tally<-tally[loc[1,1],1]

#The results below shows that Psychological Abuse is the means of control most used now. 
control_max_tally

## # A tibble: 1 × 1
##   x                               
##   <chr>                           
## 1 meansOfControlPsychologicalAbuse

Which means of control is most used with females versus males?

library(tidyr)
control$gender<-global$gender
control

## # A tibble: 48,801 × 18
##    meansOfControlDebtBondage meansOfControlTa… meansOfControlR… meansOfControlT…
##                        <dbl>             <dbl>            <dbl>            <dbl>
##  1                       -99               -99              -99              -99
##  2                       -99               -99              -99              -99
##  3                       -99               -99              -99              -99
##  4                       -99               -99              -99              -99
##  5                       -99               -99              -99              -99
##  6                       -99               -99              -99              -99
##  7                       -99               -99              -99              -99
##  8                       -99               -99              -99              -99
##  9                       -99               -99              -99              -99
## 10                       -99               -99              -99              -99
## # … with 48,791 more rows, and 14 more variables:
## #   meansOfControlPsychologicalAbuse <dbl>, meansOfControlPhysicalAbuse <dbl>,
## #   meansOfControlSexualAbuse <dbl>, meansOfControlFalsePromises <dbl>,
## #   meansOfControlPsychoactiveSubstances <dbl>,
## #   meansOfControlRestrictsMovement <dbl>,
## #   meansOfControlRestrictsMedicalCare <dbl>,
## #   meansOfControlExcessiveWorkingHours <dbl>, …

#First, checking to see how many distinct values exist in dataframe. In this data set, data of Male and Female exist.
unique(control[c("gender")])

## # A tibble: 2 × 1
##   gender
##   <chr> 
## 1 Female
## 2 Male

#transform dataframe to long form
control_long<-control%>%
  pivot_longer(meansOfControlDebtBondage:meansOfControlOther,names_to="Control Type",values_to="occurence")

#summarizes two more columns that meet the conditional statements... female or male and has an occurence of 1
tally_gender<-control_long%>%
  group_by(`Control Type`)%>%
  summarise(
    female=sum(gender=="Female" & occurence==1),
    male=sum(gender=="Male" & occurence==1)
  )
tally_gender

## # A tibble: 17 × 3
##    `Control Type`                         female  male
##    <chr>                                   <int> <int>
##  1 meansOfControlDebtBondage                 828   453
##  2 meansOfControlExcessiveWorkingHours       856  1301
##  3 meansOfControlFalsePromises              1353  1403
##  4 meansOfControlOther                      1762   111
##  5 meansOfControlPhysicalAbuse              2982   650
##  6 meansOfControlPsychoactiveSubstances     2224    58
##  7 meansOfControlPsychologicalAbuse         3549  1296
##  8 meansOfControlRestrictsFinancialAccess    116     0
##  9 meansOfControlRestrictsMedicalCare        582   701
## 10 meansOfControlRestrictsMovement          3231  1184
## 11 meansOfControlSexualAbuse                1548    46
## 12 meansOfControlTakesEarnings              1444  1332
## 13 meansOfControlThreatOfLawEnforcement      389   390
## 14 meansOfControlThreats                    2941  1031
## 15 meansOfControlUsesChildren                117     3
## 16 meansOfControlWithholdsDocuments         1024  1052
## 17 meansOfControlWithholdsNecessities        798   571

#finds the location of max occurence and returns the control type
loc_female_max<-which(tally_gender==max(tally_gender[,2]),arr.ind = TRUE)
female_max<-tally_gender[loc_female_max[1],1]
loc_male_max<-which(tally_gender==max(tally_gender[,3]),arr.ind = TRUE)
male_max<-tally_gender[loc_male_max[1],1]
#The results below shows that Psychological Abuse has the highest tally for women out of all means of control.
female_max

## # A tibble: 1 × 1
##   `Control Type`                  
##   <chr>                           
## 1 meansOfControlPsychologicalAbuse

#The results below shows that False Promises has the highest tally for men out of all means of control.
male_max

## # A tibble: 1 × 1
##   `Control Type`             
##   <chr>                      
## 1 meansOfControlFalsePromises

Subproject 2: Catherine Cho’s Post, “Income and Religion”

https://www.pewforum.org/religious-landscape-study/income-distribution/#religious-tradition

1.Currently the data reports the proportion of people per income range identify with a Religiuos group. The data can tidyed in a different way in order to assess what proportion of people per religious group fits into a certain income category. Out of each religious group total, which income range has the greatest proportion of people?

library(readr)
library(tidyr)
library(stringr)
urlfile<-"https://raw.githubusercontent.com/catcho1632/607-project-2/main/income_religion.csv"
religion<-read_csv(url(urlfile))

## Rows: 12 Columns: 6

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Religious tradition, Less than $30,000, $30,000-$49,999, $50,000-$9...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#converting characters to numeric values
religion$`Less than $30,000`<-as.numeric(str_replace_all(religion$`Less than $30,000`,"%",""))/100
religion$`$30,000-$49,999`<-as.numeric(str_replace_all(religion$`$30,000-$49,999`,"%",""))/100
religion$`$50,000-$99,999`<-as.numeric(str_replace_all(religion$`$50,000-$99,999`,"%",""))/100
religion$`$100,000 or more`<-as.numeric(str_replace_all(religion$`$100,000 or more`,"%",""))/100

#converting percentages to frequency 
religion$`Less than $30,000`<-religion$`Less than $30,000`*religion$`Sample Size`
religion$`$30,000-$49,999`<-religion$`$30,000-$49,999`*religion$`Sample Size`
religion$`$50,000-$99,999`<-religion$`$50,000-$99,999`*religion$`Sample Size`
religion$`$100,000 or more`<-religion$`$100,000 or more`*religion$`Sample Size`

#converting to long form
religion_long<-religion%>%
  pivot_longer(`Less than $30,000`:`$100,000 or more`,names_to="salary_range",values_to="frequency")
#converting to wide form to "transpose" the dataframe
religion_wide<-religion_long%>%
  pivot_wider(names_from = `Religious tradition`,values_from=frequency)
religion_wide<-select(religion_wide,-1)

#transposed dataframe
religion_wide

## # A tibble: 48 × 13
##    salary_range      Buddhist Catholic `Evangelical Protestant` Hindu `Historically B…
##    <chr>                <dbl>    <dbl>                    <dbl> <dbl>            <dbl>
##  1 Less than $30,000     83.9      NA                       NA     NA               NA
##  2 $30,000-$49,999       41.9      NA                       NA     NA               NA
##  3 $50,000-$99,999       74.6      NA                       NA     NA               NA
##  4 $100,000 or more      30.3      NA                       NA     NA               NA
##  5 Less than $30,000     NA      2209.                      NA     NA               NA
##  6 $30,000-$49,999       NA      1166.                      NA     NA               NA
##  7 $50,000-$99,999       NA      1596.                      NA     NA               NA
##  8 $100,000 or more      NA      1166.                      NA     NA               NA
##  9 Less than $30,000     NA        NA                     2612.    NA               NA
## 10 $30,000-$49,999       NA        NA                     1642.    NA               NA
## # … with 38 more rows, and 7 more variables: Jehovah's Witness <dbl>,
## #   Jewish <dbl>, Mainline Protestant <dbl>, Mormon <dbl>, Muslim <dbl>,
## #   Orthodox Christian <dbl>, Unaffiliated (religious "nones") <dbl>

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

religion_t<-data.table(religion_wide)[,lapply(.SD,function(x) x[order(is.na(x))])]
religion_t<-religion_t[1:4,]
religion_t

##         salary_range Buddhist Catholic Evangelical Protestant Hindu
## 1: Less than $30,000    83.88  2209.32                2611.70 29.24
## 2:   $30,000-$49,999    41.94  1166.03                1641.64 22.36
## 3:   $50,000-$99,999    74.56  1595.62                2089.36 58.48
## 4:  $100,000 or more    30.29  1166.03                1044.68 61.92
##    Historically Black Protestant Jehovah's Witness Jewish Mainline Protestant
## 1:                        903.12             99.84 113.28             1510.32
## 2:                        374.88             52.00 106.20             1041.60
## 3:                        289.68             45.76 169.92             1458.24
## 4:                        136.32              8.32 311.52             1197.84
##    Mormon Muslim Orthodox Christian Unaffiliated (religious "nones")
## 1: 160.38  69.70              27.90                           2240.7
## 2: 118.80  34.85              26.35                           1358.0
## 3: 196.02  59.45              55.80                           1765.4
## 4: 118.80  41.00              44.95                           1425.9

#Adding updated Sample Size column
religion_t$row_sum=rowSums(religion_t[,c(-1)])

#calculating proportion per salary_range
religion_final<-religion_t%>%
  mutate(
    across(c(2:13),
           .fns=~./row_sum)
  )
religion_final

##         salary_range    Buddhist  Catholic Evangelical Protestant       Hindu
## 1: Less than $30,000 0.008338486 0.2196278              0.2596283 0.002906740
## 2:   $30,000-$49,999 0.007007929 0.1948368              0.2743084 0.003736225
## 3:   $50,000-$99,999 0.009488069 0.2030493              0.2658797 0.007441823
## 4:  $100,000 or more 0.005420961 0.2086828              0.1869650 0.011081740
##    Historically Black Protestant Jehovah's Witness     Jewish
## 1:                    0.08977889       0.009925065 0.01126113
## 2:                    0.06264025       0.008688896 0.01774540
## 3:                    0.03686298       0.005823150 0.02162302
## 4:                    0.02439701       0.001489019 0.05575232
##    Mainline Protestant     Mormon      Muslim Orthodox Christian
## 1:           0.1501405 0.01594333 0.006928856        0.002773531
## 2:           0.1740453 0.01985078 0.005823231        0.004402931
## 3:           0.1855671 0.02494436 0.007565259        0.007100781
## 4:           0.2143758 0.02126148 0.007337716        0.008044642
##    Unaffiliated (religious "nones")  row_sum
## 1:                        0.2227473 10059.38
## 2:                        0.2269139  5984.65
## 3:                        0.2246545  7858.29
## 4:                        0.2551914  5587.57

#The following table can be further manipulated but now the table can be analyzed per income group rather than the income distribution within each group.The max proportion associated to each income group is listed in row_max. It appears that majority of the income groups fit into the evangelical protestant religion groups except for the highest earning group of $100,000 or more. In this case, the unaffiliated group has the majority. 

library(matrixStats)

## 
## Attaching package: 'matrixStats'

## The following object is masked from 'package:dplyr':
## 
##     count

religion_final$row_max=rowMaxs(as.matrix(religion_final[,c(-1,-14)]))
religion_final

##         salary_range    Buddhist  Catholic Evangelical Protestant       Hindu
## 1: Less than $30,000 0.008338486 0.2196278              0.2596283 0.002906740
## 2:   $30,000-$49,999 0.007007929 0.1948368              0.2743084 0.003736225
## 3:   $50,000-$99,999 0.009488069 0.2030493              0.2658797 0.007441823
## 4:  $100,000 or more 0.005420961 0.2086828              0.1869650 0.011081740
##    Historically Black Protestant Jehovah's Witness     Jewish
## 1:                    0.08977889       0.009925065 0.01126113
## 2:                    0.06264025       0.008688896 0.01774540
## 3:                    0.03686298       0.005823150 0.02162302
## 4:                    0.02439701       0.001489019 0.05575232
##    Mainline Protestant     Mormon      Muslim Orthodox Christian
## 1:           0.1501405 0.01594333 0.006928856        0.002773531
## 2:           0.1740453 0.01985078 0.005823231        0.004402931
## 3:           0.1855671 0.02494436 0.007565259        0.007100781
## 4:           0.2143758 0.02126148 0.007337716        0.008044642
##    Unaffiliated (religious "nones")  row_sum   row_max
## 1:                        0.2227473 10059.38 0.2596283
## 2:                        0.2269139  5984.65 0.2743084
## 3:                        0.2246545  7858.29 0.2658797
## 4:                        0.2551914  5587.57 0.2551914

Subproject 3: Victoria McEleney’s Post, “West Nile Virus”

The West Nile Virus dataset collected in the US by the CDC is summarized in the table below. Does more Neuroinvasive cases or Non-neuroinvasive cases lead to death?

library(stringr)
urlfile<-"https://raw.githubusercontent.com/catcho1632/607-project-2/main/West_Nile_Virus.csv"
virus<-read_csv(url(urlfile))

## New names:
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * `` -> ...7

## Rows: 24 Columns: 7

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Neuroinvasive disease, Non-neuroinvasive disease, Total, ...4, ...5...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

virus<-virus[3:23,1:5]
colnames(virus)<-c("year","Neuroinvasive_cases","Neuroinvasive_deaths","Non-Neuroinvasive_cases","Non-Neuroinvasive_deaths")

#String extract using regex for Neuroinvasive cases
virus$Neuroinvasive_cases<-str_replace_all(virus$Neuroinvasive_cases,",","")
virus$Neuroinvasive_cases<-as.numeric(str_extract_all(virus$Neuroinvasive_cases,"\\d+",simplify=TRUE))

#String extract using regex for Neuroinvasive deaths
virus$Neuroinvasive_deaths<-str_replace_all(virus$Neuroinvasive_deaths,"\\((\\d\\d|\\d)\\)","")
virus$Neuroinvasive_deaths<-str_extract_all(virus$Neuroinvasive_deaths,"\\d+|\\d",simplify=TRUE)
virus$Neuroinvasive_deaths<-as.numeric(unlist(str_extract(virus$Neuroinvasive_deaths, "\\d+|\\d")))

#String extract using regex for Non-Neuroinvasive cases
virus$`Non-Neuroinvasive_cases`<-str_replace_all(virus$`Non-Neuroinvasive_cases`,",","")
virus$`Non-Neuroinvasive_cases`<-as.numeric(str_extract_all(virus$`Non-Neuroinvasive_cases`,"\\d+|\\d",simplify = TRUE))

#String extract using regex for Non-Neuroinvasive deaths
virus$`Non-Neuroinvasive_deaths`<-str_replace_all(virus$`Non-Neuroinvasive_deaths`,"\\((\\d\\d|\\d)\\)","")
virus$`Non-Neuroinvasive_deaths`<-str_replace_all(virus$`Non-Neuroinvasive_deaths`,"\\(\\<\\d\\)","")
virus$`Non-Neuroinvasive_deaths`<-str_extract_all(virus$`Non-Neuroinvasive_deaths`,"\\d+|\\d",simplify=TRUE)
virus$`Non-Neuroinvasive_deaths`<-as.numeric(unlist(str_extract(virus$`Non-Neuroinvasive_deaths`, "\\d+|\\d")))

#converting deaths to proportions of cases that lead to death
virus$Neuroinvasive_deaths<-virus$Neuroinvasive_deaths/virus$Neuroinvasive_cases
virus$`Non-Neuroinvasive_deaths`<-virus$`Non-Neuroinvasive_deaths`/virus$`Non-Neuroinvasive_cases`

#The proportions are weighted per cases in a year
virus$Neuro_x_weight<-virus$Neuroinvasive_deaths*virus$Neuroinvasive_cases
virus$Non_Neuro_x_weight<-virus$`Non-Neuroinvasive_deaths`*virus$`Non-Neuroinvasive_cases`
Neuro_weighted_avg<-round(sum(virus$Neuro_x_weight)/sum(virus$Neuroinvasive_cases)*100)
Non_Neuro_weighted_avg<-round(sum(virus$Non_Neuro_x_weight)/sum(virus$`Non-Neuroinvasive_cases`)*100)
Neuro_weighted_avg

## [1] 9

Non_Neuro_weighted_avg

## [1] 0

#It is apparent that more neuroinvasive cases lead to death than non-neuroinvasive cases. It appears that neuroinvasive cases recover 90% of the time and 99.5% of the time non-neuroinvasive cases.