#CUNY SPS Bridge: R section, Assignment 2
#Name: Chinedu Onyeka
#Date: July 27th, 2021
#Problem 1: Use the summary function to get an overview of the data set. Then display the mean and median
#for at least two attributes

# First read the file from its location on GitHub.
# read.csv() is read.table() with CSV-appropriate defaults (sep = ",",
# quote = "\"", comment.char = ""), so an apostrophe or a "#" inside a field
# cannot mis-split or truncate a row.
# stringsAsFactors = FALSE keeps gender/children as character columns on every
# R version; the value recoding in Problem 5 relies on that.
affairs_url <- "https://github.com/chinedu2301/CUNY-SPS-Bridge/raw/main/Affairs.csv"
Affairs <- read.csv(file = affairs_url, header = TRUE, stringsAsFactors = FALSE)

# Show the first six rows to confirm the file was parsed as expected.
head(Affairs)
##    X affairs gender age yearsmarried children religiousness education
## 1  4       0   male  37        10.00       no             3        18
## 2  5       0 female  27         4.00       no             4        14
## 3 11       0 female  32        15.00      yes             1        12
## 4 16       0   male  57        15.00      yes             5        18
## 5 23       0   male  22         0.75       no             2        17
## 6 29       0 female  32         1.50       no             2        17
##   occupation rating
## 1          7      4
## 2          6      4
## 3          1      4
## 4          6      5
## 5          6      3
## 6          5      5
#Check the summary of the data using the summary function
summary(Affairs)
##        X           affairs          gender               age       
##  Min.   :   4   Min.   : 0.000   Length:601         Min.   :17.50  
##  1st Qu.: 528   1st Qu.: 0.000   Class :character   1st Qu.:27.00  
##  Median :1009   Median : 0.000   Mode  :character   Median :32.00  
##  Mean   :1060   Mean   : 1.456                      Mean   :32.49  
##  3rd Qu.:1453   3rd Qu.: 0.000                      3rd Qu.:37.00  
##  Max.   :9029   Max.   :12.000                      Max.   :57.00  
##   yearsmarried      children         religiousness     education    
##  Min.   : 0.125   Length:601         Min.   :1.000   Min.   : 9.00  
##  1st Qu.: 4.000   Class :character   1st Qu.:2.000   1st Qu.:14.00  
##  Median : 7.000   Mode  :character   Median :3.000   Median :16.00  
##  Mean   : 8.178                      Mean   :3.116   Mean   :16.17  
##  3rd Qu.:15.000                      3rd Qu.:4.000   3rd Qu.:18.00  
##  Max.   :15.000                      Max.   :5.000   Max.   :20.00  
##    occupation        rating     
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000  
##  Median :5.000   Median :4.000  
##  Mean   :4.195   Mean   :3.932  
##  3rd Qu.:6.000   3rd Qu.:5.000  
##  Max.   :7.000   Max.   :5.000
# Display the mean and median of two attributes (age and yearsmarried)
# explicitly, so the figures quoted below are computed by the script rather
# than copied by hand from the summary table.
mean(Affairs$age)
median(Affairs$age)
mean(Affairs$yearsmarried)
median(Affairs$yearsmarried)
#The mean of the age is 32.49 and the median for the age is 32.0
#The mean of the yearsmarried is 8.178 and the median for yearsmarried is 7.00
#Problem 2: Create a new data frame with a subset of the columns and rows. Make sure to rename it.

# I created a subset of the Affairs data frame from the 100th - 400th row and
# the seven columns from gender to occupation (the 3rd - 9th columns).
# The columns are selected by name rather than by position, so the subset stays
# correct even if the column order of the CSV changes.
subset_columns <- c(
  "gender", "age", "yearsmarried", "children",
  "religiousness", "education", "occupation"
)
Affairs_subset <- Affairs[100:400, subset_columns]
head(Affairs_subset)
##     gender age yearsmarried children religiousness education occupation
## 100   male  37       15.000      yes             4        20          6
## 101 female  27       10.000      yes             5        14          1
## 102   male  37       10.000      yes             2        18          6
## 103 female  22        0.125       no             4        12          4
## 104   male  57       15.000      yes             5        20          6
## 105 female  37       15.000      yes             4        18          6
#Problem 3: Create new column names for the new data frame
# Map each old column name to its new name. Only the four columns that actually
# change are listed; age, children and religiousness keep their names.
column_renames <- c(
  gender = "sex",
  yearsmarried = "numberofyearsmarried",
  education = "yearsofeducation",
  occupation = "profession"
)
# Fail loudly if an expected column is missing instead of mislabelling data.
stopifnot(all(names(column_renames) %in% colnames(Affairs_subset)))
# Rename by looking up each old name's position, so the result does not depend
# on the order of the columns in the data frame.
rename_idx <- match(names(column_renames), colnames(Affairs_subset))
colnames(Affairs_subset)[rename_idx] <- unname(column_renames)
head(Affairs_subset)
##        sex age numberofyearsmarried children religiousness yearsofeducation
## 100   male  37               15.000      yes             4               20
## 101 female  27               10.000      yes             5               14
## 102   male  37               10.000      yes             2               18
## 103 female  22                0.125       no             4               12
## 104   male  57               15.000      yes             5               20
## 105 female  37               15.000      yes             4               18
##     profession
## 100          6
## 101          1
## 102          6
## 103          4
## 104          6
## 105          6
#I changed the name of the "gender" column to "sex", "occupation" to "profession", "education" to "yearsofeducation"
#and "yearsmarried" to "numberofyearsmarried"
#Problem 4: Use the summary function to create an overview of the new data frame. 
#Print the mean and median for the same two attributes. Please compare.

summary(Affairs_subset)
##      sex                 age        numberofyearsmarried   children        
##  Length:301         Min.   :17.50   Min.   : 0.125       Length:301        
##  Class :character   1st Qu.:27.00   1st Qu.: 1.500       Class :character  
##  Mode  :character   Median :32.00   Median : 7.000       Mode  :character  
##                     Mean   :32.35   Mean   : 7.751                         
##                     3rd Qu.:37.00   3rd Qu.:15.000                         
##                     Max.   :57.00   Max.   :15.000                         
##  religiousness   yearsofeducation   profession 
##  Min.   :1.000   Min.   : 9.00    Min.   :1.0  
##  1st Qu.:2.000   1st Qu.:14.00    1st Qu.:3.0  
##  Median :3.000   Median :16.00    Median :5.0  
##  Mean   :3.213   Mean   :16.15    Mean   :4.1  
##  3rd Qu.:4.000   3rd Qu.:18.00    3rd Qu.:5.0  
##  Max.   :5.000   Max.   :20.00    Max.   :7.0
# Print the mean and median of the same two attributes for the subset, so the
# comparison below rests on values computed by the script. The subset uses the
# renamed column numberofyearsmarried in place of yearsmarried.
mean(Affairs_subset$age)
median(Affairs_subset$age)
mean(Affairs_subset$numberofyearsmarried)
median(Affairs_subset$numberofyearsmarried)
#The mean of the age in the Affairs data frame is 32.49, while mean of age in Affairs_subset is 32.35
#The median of the age in the Affairs data frame is 32.0, while median of age in Affairs_subset is 32.00
#From the results above, there is no significant difference in the mean and median of the Affairs data when compared to that of the subset (Affairs_subset)


#The mean of the yearsmarried in the Affairs data frame is 8.178, while the mean of numberofyearsmarried is 7.751
#The median of the yearsmarried in the Affairs data frame is 7.0, while the median of numberofyearsmarried is 7.0
#From the results shown above, the median of yearsmarried from the Affairs data frame and that of the numberofyearsmarried in the Affairs_subset data frame are exactly the same while a slight difference exists in their mean values.
#Problem 5: For at least 3 values in a column, please rename so that every value in that column is renamed.
# Abbreviate every value in the sex and children columns.
# replace(x, mask, value) returns x with the entries selected by mask
# overwritten, so each statement applies exactly one old -> new mapping.
Affairs_subset$sex <- replace(
  Affairs_subset$sex, Affairs_subset$sex == "male", "M"
)
Affairs_subset$sex <- replace(
  Affairs_subset$sex, Affairs_subset$sex == "female", "F"
)
Affairs_subset$children <- replace(
  Affairs_subset$children, Affairs_subset$children == "yes", "Y"
)
Affairs_subset$children <- replace(
  Affairs_subset$children, Affairs_subset$children == "no", "N"
)
# Show the first rows to confirm the recoded values.
head(Affairs_subset)
##     sex age numberofyearsmarried children religiousness yearsofeducation
## 100   M  37               15.000        Y             4               20
## 101   F  27               10.000        Y             5               14
## 102   M  37               10.000        Y             2               18
## 103   F  22                0.125        N             4               12
## 104   M  57               15.000        Y             5               20
## 105   F  37               15.000        Y             4               18
##     profession
## 100          6
## 101          1
## 102          6
## 103          4
## 104          6
## 105          6
#I changed all "male" and "female" in the sex column to M and F respectively.
#Also, I changed all "yes" and "no" in the children column to Y and N respectively.
#Problem 6: Display enough rows to see examples of all steps 1-5 above

#Answers: Output is shown in each case in 1-5 above.
#Problem 7: Place the original .csv in a github file and have R read from the link.

#Answer: See solution in problem 1