reading <- read.csv("half_term_report.csv", stringsAsFactors = FALSE)
dim(reading)
## [1] 311 16
summary(reading)
## X Time Nickname Book_title
## Min. : 1.0 Length:311 Length:311 Length:311
## 1st Qu.: 78.5 Class :character Class :character Class :character
## Median :156.0 Mode :character Mode :character Mode :character
## Mean :156.0
## 3rd Qu.:233.5
## Max. :311.0
## Author Publisher Genre
## Length:311 Length:311 Length:311
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Summary Opinion Stars Campus
## Length:311 Length:311 Min. :1.000 Length:311
## Class :character Class :character 1st Qu.:2.000 Class :character
## Mode :character Mode :character Median :3.000 Mode :character
## Mean :2.945
## 3rd Qu.:4.000
## Max. :5.000
## Gender Teacher_Assessment Plagiarism_Source
## Length:311 Min. :0.00 Length:311
## Class :character 1st Qu.:2.00 Class :character
## Mode :character Median :3.00 Mode :character
## Mean :2.83
## 3rd Qu.:3.00
## Max. :4.00
## Teacher_Assessment_2 Period
## Mode:logical Length:311
## NA's:311 Class :character
## Mode :character
##
##
##
str(reading)
## 'data.frame': 311 obs. of 16 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Time : chr "08/05/2018 13:02:23" "15/05/2018 12:57:50" "01/05/2018 08:18:28" "15/05/2018 10:01:16" ...
## $ Nickname : chr "ryo" "ryo" "mako" "mako" ...
## $ Book_title : chr "Six Sketches" "Marcel and the Shakespeare Letters" "A Christmas Carol" "A Little Princess" ...
## $ Author : chr "Leslie Dunkling" "Stephen Rabley" "Charles Dickens" "FRANCES HODGSON BURNETT" ...
## $ Publisher : chr "Penguin" "Pearson" "Pearson" "Oxford" ...
## $ Genre : chr "children's literature" "children's literature" "fantasy" "other" ...
## $ Summary : chr "There are six funny stories written.\n1. I'm Right !\nNorma and Corin forgot the train ticket, also ride on wro"| __truncated__ "The Story of two mice. Shakespeare letter bring Professor Barton's flat." "This story is set in London in the 19th century.\nScrooge of the hero had been chased by work for Christmas.\nH"| __truncated__ "This story is the story of the 19th century England. The main character is Sara. Her father Ralph was very rich"| __truncated__ ...
## $ Opinion : chr "I think Norma is stupid." "This two mice is so clever." "I felt I could be happier as I got more kind to others." "I was impressed by Sara's strong way of living. And though it was painful like Sara, I thought that happiness w"| __truncated__ ...
## $ Stars : int 1 1 2 2 1 2 3 3 3 4 ...
## $ Campus : chr "Tokyo" "Tokyo" "Tokyo" "Tokyo" ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Teacher_Assessment : int 2 1 3 4 3 2 3 3 3 4 ...
## $ Plagiarism_Source : chr "" "" "" "" ...
## $ Teacher_Assessment_2: logi NA NA NA NA NA NA ...
## $ Period : chr NA NA NA NA ...
# Keep only the first submission of each distinct Summary text, then check
# how many rows are left.
reading <- subset(reading, !duplicated(Summary))
dim(reading)
## [1] 307 16
# Columns holding categorical values rather than free text.
vars <- c(
  "Nickname", "Book_title", "Author", "Publisher",
  "Genre", "Stars", "Campus", "Gender", "Period"
)
# Convert each of those columns to a factor.
reading[vars] <- lapply(reading[vars], factor)
# Parse the day/month/year submission timestamps into POSIXct.
reading$Time <- as.POSIXct(reading$Time, format = "%d/%m/%Y %H:%M:%S")
str(reading)
## 'data.frame': 307 obs. of 16 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Time : POSIXct, format: "2018-05-08 13:02:23" "2018-05-15 12:57:50" ...
## $ Nickname : Factor w/ 58 levels "alice","asahi",..: 38 38 22 22 22 22 22 22 22 57 ...
## $ Book_title : Factor w/ 209 levels "a chrismas carol",..: 121 84 2 4 3 188 88 207 204 58 ...
## $ Author : Factor w/ 159 levels "alan battersby",..: 89 143 24 52 131 17 132 85 85 74 ...
## $ Publisher : Factor w/ 6 levels "Cambridge","Macmillan",..: 6 5 5 4 1 5 5 4 4 3 ...
## $ Genre : Factor w/ 13 levels "action adventure",..: 3 3 5 9 7 1 5 11 9 1 ...
## $ Summary : chr "There are six funny stories written.\n1. I'm Right !\nNorma and Corin forgot the train ticket, also ride on wro"| __truncated__ "The Story of two mice. Shakespeare letter bring Professor Barton's flat." "This story is set in London in the 19th century.\nScrooge of the hero had been chased by work for Christmas.\nH"| __truncated__ "This story is the story of the 19th century England. The main character is Sara. Her father Ralph was very rich"| __truncated__ ...
## $ Opinion : chr "I think Norma is stupid." "This two mice is so clever." "I felt I could be happier as I got more kind to others." "I was impressed by Sara's strong way of living. And though it was painful like Sara, I thought that happiness w"| __truncated__ ...
## $ Stars : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 2 1 2 3 3 3 4 ...
## $ Campus : Factor w/ 2 levels "Saitama","Tokyo": 2 2 2 2 2 2 2 2 2 2 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 1 2 ...
## $ Teacher_Assessment : int 2 1 3 4 3 2 3 3 3 4 ...
## $ Plagiarism_Source : chr "" "" "" "" ...
## $ Teacher_Assessment_2: logi NA NA NA NA NA NA ...
## $ Period : Factor w/ 3 levels "Ikebukuro period 2 (10:45am to 12:15pm)",..: NA NA NA NA NA NA NA NA NA NA ...
library(quanteda)
## Package version: 1.3.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
sentences <- tokens(reading$Summary, what="sentence")
typeof(sentences)
## [1] "list"
sentencesDf <- as.data.frame(table(unlist(sentences)))
head(sentencesDf)
## Var1
## 1 1804,Britain was at war with France.
## 2 There are two stories in this book .
## 3 This is a horror novel.
## 4 A sunny day, Alice was reading a book under a tree with her sisiter.
## 5 American Life explains American history and culture and life style.
## 6 Aramis who is bishop visited Bastille prison.
## Freq
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Sentences that occur more than once across all summaries.
duplicates <- subset(sentencesDf, Freq > 1)
# Rough share of repeated sentences: each repeated sentence is counted twice
# and compared with the number of distinct sentences.
2 * nrow(duplicates) / nrow(sentencesDf)
## [1] 0.01967593
duplicates
## Var1
## 64 1.
## 104 A letter is written what he is Washington now.
## 161 After all , Beth recovered , like that their father returned to their home.
## 321 Amy is fourth daughter.
## 567 Beth is third daughter.She doesn't go to school.
## 594 But their father return to this country from the war.Because he were sick.
## 724 But they finally dyed their hands on foul play.
## 903 Everyone's wish come true.
## 970 For example, They give their mother a christmas present and be provided the dinner for Lawrence of neighbor.
## 986 Four sisters lives strenuously ,but such time Beth becomes sick.
## 987 Four sisters tried to live strenuously.
## 988 Four students went to see football with the teacher one day.
## 1313 He was disappointed.
## 1351 He was very sad.
## 1767 Jo is second daughter.She is fifteen years old.
## 1856 Lisa and Alice are going to stay the Hotel Oracle.they are enjoy here.
## 1920 Meg is eldest daughter of a March family.She is sixteen years old.She's working as a private teacher.
## 2159 One day,they heared a rumor that Matt Lepadi and Claudia Carman who are Claudia's fiancee are staying on this island.When Lisa on the balcony,she saw two peaple on the balcony of the hotel Astra.they quarrel.Lisa are impatient, but She know this is practice their scens after fight.
## 2247 Rick Evelyn and Alex found the gold bracelet of Anubis in Egypt ruins.
## 2357 She is twenteen years old.She goes to school.
## 2370 She loved him too.
## 2453 She's working at aunt's house.
## 2649 That practice was hard and difficult, but everyone helped and got over it.
## 2735 The main character of this story is the four unwilling students of a public high school.
## 2779 The public high school they attended was low-priced because they had little money and were not smart.
## 2816 The teacher decides to compete with private high school by Tetraslon as a trigger.
## 2843 Their father went to a war because there are no enough money.
## 3006 There was a good private high school which is rich near this high school, but it did not become a comparison.
## 3078 They encounter a rare competition called Tetraslon.
## 3085 They gave a letter from their father.
## 3090 They had a lot of people cooperate, such as school and parents.
## 3144 They tried practicing for a year and came to the game.
## 3254 This story of main character is four people.
## 3272 Though they were falling students, motivation and love gradually started growing.
## Freq
## 64 3
## 104 2
## 161 2
## 321 2
## 567 2
## 594 2
## 724 2
## 903 2
## 970 2
## 986 2
## 987 2
## 988 2
## 1313 2
## 1351 3
## 1767 2
## 1856 2
## 1920 2
## 2159 2
## 2247 2
## 2357 2
## 2370 2
## 2453 2
## 2649 2
## 2735 2
## 2779 2
## 2816 2
## 2843 2
## 3006 2
## 3078 2
## 3085 2
## 3090 2
## 3144 2
## 3254 2
## 3272 2
# For every repeated sentence, print the Nickname and Book_title (columns
# 3:4) of each submission whose Summary contains it.
# The sentence is matched literally (fixed = TRUE).  Used as a regular
# expression, "." matches any character, so a fragment such as "1." also
# hits unrelated text like "19th".
# seq_len() makes the loop a no-op when there are no repeated sentences.
# NOTE(review): the output recorded below was produced with regex matching;
# regenerate it after this change.
for (i in seq_len(nrow(duplicates))) {
  print(
    reading[
      grep(as.character(duplicates$Var1[i]), reading$Summary, fixed = TRUE),
    ][3:4]
  )
}
## Nickname Book_title
## 1 ryo six sketches
## 3 mako a christmas carol
## 4 mako a little princess
## 13 yuto_y mother teresa
## 14 yuto_y marcel goes to hollywood
## 25 tomoya suffer!
## 29 hide the swiss family robinson
## 36 hide american life
## 43 jun new york
## 46 jun michael jordan
## 47 jun jennifer lopez
## 52 atsuhito david beckham
## 54 atsuhito audrey hepburn
## 74 fumiya brazil american republics series no.3
## 77 fumiya gulliver ‘s travel in lilliput
## 81 taichi lucky number
## 88 yuto_s twenty thousand leagues under the sea
## 100 shintaro a tale of two cities
## 101 shintaro the man in the iron mask
## 113 ayumi robinson crusoe
## 115 yuki this is london
## 117 yuki barack obama
## 119 yuki the death of karen silkwood
## 121 harumi the cellist of sarajevo
## 122 harumi eye of the storm
## 123 harumi taste and other tales
## 125 harumi tales of the supernatural
## 143 kouki lucky number
## 154 genki the mummy
## 157 genki the death of karen silkwood
## 159 nono the jungle book
## 166 yumi stories from the five towns
## 167 yumi tooth and claw
## 168 yumi superbird
## 169 yumi simply suspence
## 170 yumi love story
## 171 yumi bad company
## 172 yumi changing their skies
## 173 yumi king's ransom
## 175 natsu k's first case
## 180 natsu the black tulip
## 209 sera the beatles
## 210 sera the piano
## 211 sera princess diana
## 212 kazuma this is london
## 222 miki the mysterious island
## 231 mayu tales from the arabian nights
## 233 mayu the adventure of tom sawyer
## 240 eishi american life
## 245 yuji jumanji
## 246 yuna the adventures of huckleberry finn
## 247 yuna northanger abbey
## 250 yuna the last of the mohican
## 252 yuna good wives
## 253 yuna a tale of two cities
## 263 minako the scarlet letter
## 265 minako wuthering heights
## 278 atsushi ten long years
## 279 minami billy budd
## 280 minami northanger abbey
## 281 minami jane eyre
## 282 minami the trumpet-major
## 283 minami the last of mohicans
## 287 tmk washington square
## 294 rino the black tulip
## 301 mike michael jordan
## 309 kandai northanger abbey
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 67 shimpei l.a.raid
## 256 kasumi the house on the hill
## Nickname Book_title
## 264 minako the house on the hill
## 275 alice the house on the hill
## 279 minami billy budd
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 295 mri shootingstars
## 296 mri lisa and alice are good friends
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 295 mri shootingstars
## 296 mri lisa and alice are good friends
## Nickname Book_title
## 73 fumiya the mummy returns
## 84 yuto_s the mummy returns
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 54 atsuhito audrey hepburn
## 130 mirei tales from the arabian nights
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
reading[grep("Rick Evelyn and Alex found the gold bracelet of Anubis in Egypt ruins.", reading$Summary),][3:4]
## Nickname Book_title
## 73 fumiya the mummy returns
## 84 yuto_s the mummy returns
reading[grep("When Lisa on the balcony,she saw two peaple on the balcony of the hotel Astra.they quarrel.", reading$Summary),][3:4]
## Nickname Book_title
## 295 mri shootingstars
## 296 mri lisa and alice are good friends
# Drop the mri submission that the teacher scored 0: keep every row that is
# not both written by mri and zero-scored.
reading <- reading[
  reading$Nickname != "mri" | reading$Teacher_Assessment != 0,
]
dim(reading)
## [1] 306 16
reading[grep("For example, They give their mother a christmas present and be provided the dinner for Lawrence of neighbor.", reading$Summary),][3:4]
## Nickname Book_title
## 289 tmk little woman
## 291 tmk little women
reading <- reading[!(reading$Nickname=="tmk" & reading$Book_title=="little woman"),]
dim(reading)
## [1] 305 16
reading[grep("The public high school they attended was low-priced because they had little money and were not smart.", reading$Summary),][3:4]
## Nickname Book_title
## 8 mako who,sir? me,sir?'
## 9 mako who sir? me,sir?
identical(reading$Summary[8], reading$Summary[9])
## [1] FALSE
reading <- reading[!(reading$Nickname=="mako" & reading$Book_title=="who,sir? me,sir?'"),]
dim(reading)
## [1] 304 16
reading[grep("His name was Dorian and he was young and very beautiful.", reading$Summary),]
## X Time Nickname Book_title Author
## 292 292 2018-05-12 00:08:56 rino the picture of dorian gray oscar wild
## 293 293 2018-06-08 01:44:21 rino the picture of dorian gray oscar wild
## Publisher Genre
## 292 Macmillan historical fiction
## 293 Macmillan mystery
## Summary
## 292 Henly and Basil were good friends who studied at the university.\nbaysil was the artist I coud paint the best picture ever.\nWhich is because of a boy.His name was Dorian and he was young and very beautiful.Also Baysil refused.Henly met Dolian and talk about his life in various ways.\nBaysil drew a picture of Dolian,but he was not pleased.\nBecause he thought that he was young and beautiful so that sorrounding people cared about himself,and he hopes that the painting should take his age instead of himself.\nthat wish will come true.\nThe picture was wrinkled after a day and it became ugly and did bad things and become cruei face.But he remainded young aand beautiful.\nhe kept hiding the picture,but he was afraid to find it by second and tried to break the picture.\nWhen he broke the picture he returned to the real form and died.\n
## 293 Henry and Basil was good friends who studeied at the university.\nBayzil was the artist.I could paint the best picture ever.\nWhich is because of a boy.\nHis name was Dorian and he was young and very beautiful. Althou Baysil refused.\nHenry met Dorian and talk about his life in various ways.\nBAysil was drew a picture of Dorian, but he was not pleased.\nBecause he thought that he was young and bwautiful so that sorrounding people cared about hinself, and he hopes that the painting should take his age instead of himself.\nTHat wish will come true.\nThe picture was wrinkled after a day and it become ugly and did bad things and became crusl face.\nBut he remainded young and beautiful.\nHe kept hiding the picture, but he was afraid to find it by second and tried and break the picture.\nWhen he broke the picture he turened to the real form and died.\n
## Opinion
## 292 this story was very sad.\nI didn't think there would be such a story.\nIt seems not to be very popular in Japan.\n
## 293 I think this story was sad story.\nIt is because he thinks that Basil wanted to purely painting.\nAnd I think Dorian was just proud of his beauty and youth.So, I think Henly is bad people.\nI don't like people like Henly.\nDorian reflected on bad things, but in the end his died.\nI though it was a mirror cares.\nAnd it was a piece to Dorian who had a free way of living.\nSO, I think that I wanted to be a human being who can distinguish between mistakes.\nFinaly, there are not so many bad-end works in Japan.\nSO,I think very interested.\n \n
## Stars Campus Gender Teacher_Assessment Plagiarism_Source
## 292 2 Saitama Female 2
## 293 4 Saitama Female 3
## Teacher_Assessment_2 Period
## 292 NA <NA>
## 293 NA Niiza period 4 (3:00pm to 4:30pm)
# Of rino's two Dorian Gray submissions, drop the one filed under
# "historical fiction" and keep the other.
reading <- reading[
  !with(
    reading,
    Nickname == "rino" &
      Book_title == "the picture of dorian gray" &
      Genre == "historical fiction"
  ),
]
dim(reading)
## [1] 303 16
# Re-run the repeated-sentence check on the cleaned data.
sentences <- tokens(x = reading$Summary, what = "sentence")
# One row per distinct sentence, with its number of occurrences in Freq.
sentencesDf <- as.data.frame(table(unlist(sentences)))
duplicates <- subset(sentencesDf, Freq > 1)
# Same rough share as before: repeated sentences counted twice, relative to
# the number of distinct sentences.
2 * nrow(duplicates) / nrow(sentencesDf)
## [1] 0.002902758
duplicates$Var1
## [1] 1.
## [2] He was disappointed.
## [3] He was very sad.
## [4] Rick Evelyn and Alex found the gold bracelet of Anubis in Egypt ruins.
## [5] She loved him too.
## 3445 Levels: 1804,Britain was at war with France. ...
reading$Month <- as.factor(months(reading$Time))
table(reading$Month)
##
## April June May
## 77 40 186
# Bin each submission into week-long intervals of its timestamp, then relabel
# the resulting levels W1, W2, ... in level order.
# seq_len() stays correct even if there are no levels, and paste0() replaces
# paste(..., sep = "").
reading$Week <- cut(reading$Time, breaks = "weeks")
levels(reading$Week) <- paste0("W", seq_len(nlevels(reading$Week)))
table(reading$Week)
##
## W1 W2 W3 W4 W5 W6 W7 W8 W9
## 1 25 46 43 44 48 38 34 24
table(table(reading$Week, reading$Nickname))
##
## 0 1 2 3 4
## 249 249 20 2 2
reading$Day <- as.factor(weekdays(reading$Time))
sort(table(reading$Day))
##
## Wednesday Sunday Saturday Tuesday Monday Thursday Friday
## 17 19 20 40 48 74 85
# Split the submissions by campus.  Unused Nickname levels are dropped so
# that tables of each subset list only the students present in it.
Tokyo <- reading[reading$Campus == "Tokyo", ]
Tokyo$Nickname <- droplevels(Tokyo$Nickname)
Saitama <- reading[reading$Campus == "Saitama", ]
Saitama$Nickname <- droplevels(Saitama$Nickname)
table(Tokyo$Nickname)
##
## atsuhito fumiya hide jun kakuto koharu mai mako
## 7 6 8 6 7 5 3 6
## miku moeka rio ryo sena shimpei shintaro taichi
## 7 6 3 2 3 7 6 5
## tomo tomoya yuto_s yuto_y
## 1 4 7 7
# Split the submissions that carry a Niiza class period into one data frame
# per period, dropping Nickname levels that do not occur in that period.
SaitamaPeriod3 <- subset(
  reading,
  Period == "Niiza period 3 (1:15pm to 2:45pm)"
)
SaitamaPeriod3$Nickname <- droplevels(SaitamaPeriod3$Nickname)
SaitamaPeriod4 <- subset(
  reading,
  Period == "Niiza period 4 (3:00pm to 4:30pm)"
)
SaitamaPeriod4$Nickname <- droplevels(SaitamaPeriod4$Nickname)
table(SaitamaPeriod3$Nickname)
##
## asahi aya ayumi chacha genki harumi kazuma kouki manabu mayu
## 3 8 5 2 8 9 7 7 4 8
## miki mirei moena natsu nono sakura sera yuki yumi yuuka
## 7 6 8 7 4 8 8 6 8 5
table(SaitamaPeriod4$Nickname)
##
## alice atsushi eishi kandai kasumi kensuke mike minako minami
## 5 1 4 4 3 5 7 5 5
## mri rino saki shine shun sit tmk yuji yuna
## 3 2 4 1 2 1 5 3 8
table(reading$Nickname, reading$Week)
##
## W1 W2 W3 W4 W5 W6 W7 W8 W9
## alice 0 1 0 2 1 0 0 1 0
## asahi 0 1 0 1 0 1 0 1 0
## atsuhito 0 0 1 1 1 1 0 1 2
## atsushi 0 0 0 0 0 0 1 0 0
## aya 0 1 1 1 1 1 1 1 1
## ayumi 0 0 1 1 1 0 1 1 0
## chacha 0 1 1 0 0 0 0 0 0
## eishi 0 1 1 0 1 0 0 0 1
## fumiya 0 0 1 1 1 1 1 1 0
## genki 0 0 1 0 0 4 1 1 1
## harumi 1 0 1 1 1 1 1 3 0
## hide 0 1 1 1 1 1 1 1 1
## jun 0 1 0 2 1 0 2 0 0
## kakuto 0 0 2 0 1 1 1 1 1
## kandai 0 0 1 1 1 1 0 0 0
## kasumi 0 0 2 1 0 0 0 0 0
## kazuma 0 0 0 0 4 1 1 0 1
## kensuke 0 1 0 1 1 0 1 0 1
## koharu 0 0 1 0 1 1 1 0 1
## kouki 0 1 1 1 1 1 1 0 1
## mai 0 0 1 1 0 1 0 0 0
## mako 0 0 1 1 1 1 1 0 1
## manabu 0 0 0 1 0 1 1 1 0
## mayu 0 1 1 1 1 1 1 1 1
## mike 0 1 1 1 1 1 0 1 1
## miki 0 1 1 1 1 1 1 1 0
## miku 0 0 1 1 0 1 1 2 1
## minako 0 0 1 0 1 1 1 0 1
## minami 0 1 1 1 0 1 0 1 0
## mirei 0 1 1 1 1 0 2 0 0
## moeka 0 0 1 1 2 0 1 1 0
## moena 0 1 1 1 1 1 1 1 1
## mri 0 0 0 0 0 1 2 0 0
## natsu 0 1 1 1 1 1 1 1 0
## nono 0 1 1 1 1 0 0 0 0
## rino 0 0 0 0 1 0 0 0 1
## rio 0 0 1 1 0 1 0 0 0
## ryo 0 0 0 0 1 1 0 0 0
## saki 0 1 0 1 0 0 2 0 0
## sakura 0 1 1 1 1 1 1 1 1
## sena 0 0 1 1 1 0 0 0 0
## sera 0 1 1 1 1 2 1 0 1
## shimpei 0 1 0 2 1 0 1 2 0
## shine 0 0 1 0 0 0 0 0 0
## shintaro 0 0 2 1 1 1 1 0 0
## shun 0 0 0 0 0 0 0 1 1
## sit 0 0 0 0 0 1 0 0 0
## taichi 0 0 1 1 1 1 0 1 0
## tmk 0 0 1 0 0 3 0 1 0
## tomo 0 0 1 0 0 0 0 0 0
## tomoya 0 0 0 0 1 2 0 1 0
## yuji 0 0 1 0 0 2 0 0 0
## yuki 0 1 1 1 1 1 1 0 0
## yumi 0 1 1 1 1 1 1 0 2
## yuna 0 1 1 1 1 1 1 1 1
## yuto_s 0 0 1 1 1 1 1 2 0
## yuto_y 0 0 1 1 1 1 1 2 0
## yuuka 0 1 1 1 1 1 0 0 0
library(ggplot2)
# Count submissions for every week/student pair (Var1 = week, Var2 = student)
# and draw the counts as a tile grid shaded from white to purple.  Student
# names and tick marks are hidden on the y axis.
heatmap <- as.data.frame(table(reading$Week, reading$Nickname))
figure1 <- ggplot(
  data = heatmap,
  mapping = aes(x = Var1, y = Var2, fill = Freq)
) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "purple") +
  labs(
    title = "Homework Assignments Per Student Per Week",
    x = "Week",
    y = "Students"
  ) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
figure1
round(tapply(reading$opinionTokens, reading$Week, mean))
## W1 W2 W3 W4 W5 W6 W7 W8 W9
## 54 46 40 36 39 34 37 39 64
ggplot(aes(x = Week, y = opinionTokens), data = reading) +
geom_point() +
ggtitle("Opinion Word Counts per Student per Week") +
ylab("Opinion Word Counts")
mean(reading$summaryTokens)
## [1] 164.0099
mean(reading$opinionTokens)
## [1] 40.35314
library(ggplot2)
ggplot() +
geom_histogram(aes(x=opinionTokens), data = reading, binwidth=10, fill = "red") +
geom_histogram(aes(x=summaryTokens), data = reading, binwidth=10, fill = "green") +
scale_x_continuous(breaks = seq(0,1200,100)) +
xlab("Word Count Per Student") +
ylab("Number of Assignments") +
ggtitle("Opinion and Summary Word Counts")
# Compare mean summary length with mean opinion length (Welch two-sample
# t-test).  The column is `opinionTokens`; the misspelt `OpinionTokens` does
# not exist, so `$` returned NULL and t.test() silently fell back to a
# one-sample test of summaryTokens against 0.
# NOTE(review): the output recorded below came from that one-sample run and
# must be regenerated.
t.test(reading$summaryTokens, reading$opinionTokens)
##
## One Sample t-test
##
## data: reading$summaryTokens
## t = 27.53, df = 302, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 152.2867 175.7332
## sample estimates:
## mean of x
## 164.0099
library(effsize)
cohen.d(reading$opinionTokens, reading$summaryTokens, conf.level=0.95)
##
## Cohen's d
##
## d estimate: -1.610531 (large)
## 95 percent confidence interval:
## inf sup
## -1.794140 -1.426922
tapply(reading$totalTokens, reading$Campus, mean)
## Saitama Tokyo
## 225.8426 164.4434
tapply(reading$totalTokens, reading$Campus, sd)
## Saitama Tokyo
## 131.86775 62.39705
ggplot(aes(x=totalTokens), data=reading) +
geom_histogram(aes(fill=Campus), binwidth=10) +
scale_x_continuous(breaks = seq(0,1500,100)) +
xlab("Word Count Per Student") +
ylab("Number of Assignments") +
ggtitle("Homework Assignment Word Count per Campus")
tapply(reading$totalTokens, reading$Campus, median)
## Saitama Tokyo
## 206 158
t.test(Saitama$totalTokens, Tokyo$totalTokens)
##
## Welch Two Sample t-test
##
## data: Saitama$totalTokens and Tokyo$totalTokens
## t = 5.4917, df = 297.04, p-value = 8.554e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 39.39656 83.40193
## sample estimates:
## mean of x mean of y
## 225.8426 164.4434
cohen.d(Saitama$totalTokens, Tokyo$totalTokens, conf.level=0.95)
##
## Cohen's d
##
## d estimate: 0.5452318 (medium)
## 95 percent confidence interval:
## inf sup
## 0.3042118 0.7862518
tapply(reading$totalTokens, reading$Gender, mean)
## Female Male
## 227.4277 173.6692
tapply(reading$totalTokens, reading$Gender, sd)
## Female Male
## 138.91252 64.89961
ggplot(aes(x=totalTokens), data=reading) +
geom_histogram(aes(fill=Gender), binwidth=10) +
scale_x_continuous(breaks = seq(0,1500,100)) +
xlab("Word Count Per Student") +
ylab("Number of Assignments") +
ggtitle("Homework Assignment Word Count per Gender")
tapply(reading$totalTokens, reading$Gender, median)
## Female Male
## 203.0 167.5
# Split the submissions by gender.  Both groups are selected the same way, by
# naming the level explicitly, so rows with a missing or unexpected Gender
# cannot end up in `males` (raw `!=` indexing would turn NA into all-NA rows).
females <- subset(reading, Gender == "Female")
males <- subset(reading, Gender == "Male")
t.test(females$totalTokens, males$totalTokens)
##
## Welch Two Sample t-test
##
## data: females$totalTokens and males$totalTokens
## t = 4.4808, df = 257.47, p-value = 1.119e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 30.13269 77.38434
## sample estimates:
## mean of x mean of y
## 227.4277 173.6692
cohen.d(females$totalTokens, males$totalTokens, conf.level=0.95)
##
## Cohen's d
##
## d estimate: 0.4745734 (small)
## 95 percent confidence interval:
## inf sup
## 0.2430295 0.7061173
table(reading$Campus)
##
## Saitama Tokyo
## 197 106
reading %>%
group_by(Campus) %>%
summarize(count = n_distinct(Nickname))
## # A tibble: 2 x 2
## Campus count
## <fct> <int>
## 1 Saitama 38
## 2 Tokyo 20
# Goodness-of-fit test: are postings split between campuses in proportion to
# the number of students on each campus?
# Both vectors are derived from the data, in the order Tokyo, Saitama, rather
# than typed in by hand, so they stay correct if the cleaning steps change.
# With the current data they are 106/197 postings and 20/38 students.
observed_postings_campus <- as.vector(
  table(reading$Campus)[c("Tokyo", "Saitama")]
)
# Number of distinct students per campus.
expected_postings_campus <- as.vector(
  tapply(
    as.character(reading$Nickname),
    reading$Campus,
    function(x) length(unique(x))
  )[c("Tokyo", "Saitama")]
)
# Turn the student counts into expected posting proportions.
expected_probs_campus <- prop.table(expected_postings_campus)
chisq.test(observed_postings_campus, p = expected_probs_campus)
##
## Chi-squared test for given probabilities
##
## data: observed_postings_campus
## X-squared = 0.033629, df = 1, p-value = 0.8545
table(reading$Gender)
##
## Female Male
## 173 130
reading %>%
group_by(Gender) %>%
summarize(count = n_distinct(Nickname))
## # A tibble: 2 x 2
## Gender count
## <fct> <int>
## 1 Female 31
## 2 Male 27
# Goodness-of-fit test: are postings split between genders in proportion to
# the number of female and male students?
# Both vectors are derived from the data, in the order Female, Male, rather
# than typed in by hand, so they stay correct if the cleaning steps change.
# With the current data they are 173/130 postings and 31/27 students.
observed_postings_gender <- as.vector(
  table(reading$Gender)[c("Female", "Male")]
)
# Number of distinct students per gender.
expected_postings_gender <- as.vector(
  tapply(
    as.character(reading$Nickname),
    reading$Gender,
    function(x) length(unique(x))
  )[c("Female", "Male")]
)
# Turn the student counts into expected posting proportions.
expected_probs_gender <- prop.table(expected_postings_gender)
chisq.test(observed_postings_gender, p = expected_probs_gender)
##
## Chi-squared test for given probabilities
##
## data: observed_postings_gender
## X-squared = 1.6201, df = 1, p-value = 0.2031
# Turn the Stars factor back into numbers via its labels.  as.numeric() on a
# factor returns the internal level codes, which equal the ratings only while
# the levels are exactly "1".."5".  Going through as.character() converts the
# labels themselves and is also safe if Stars is already numeric.
reading$Stars <- as.numeric(as.character(reading$Stars))
# The fifty most frequently reported titles, with their average star rating;
# ties in count are ordered by higher average.
borrowed_often <- reading %>%
  select(Book_title, Stars, Nickname) %>%
  group_by(Book_title) %>%
  summarize(count = n(), average = mean(Stars)) %>%
  arrange(desc(count), desc(average)) %>%
  head(50)
borrowed_often
## # A tibble: 50 x 3
## Book_title count average
## <fct> <int> <dbl>
## 1 marco 7 2.29
## 2 the house on the hill 6 2.67
## 3 barack obama 4 3.5
## 4 the jungle book 4 3.5
## 5 the piano 4 3.5
## 6 the long tunnel 4 3.25
## 7 dangerous journey 4 2.75
## 8 superbird 4 2.75
## 9 the black cat and other stories 3 4.33
## 10 american life 3 4
## # ... with 40 more rows
# The ten genres with the most submissions, most frequent first.
popular_genres <- reading %>%
  select(Genre) %>%
  group_by(Genre) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)
popular_genres
## # A tibble: 10 x 2
## Genre count
## <fct> <int>
## 1 fantasy 49
## 2 other 47
## 3 action adventure 43
## 4 mystery 41
## 5 non-fiction 26
## 6 romance 24
## 7 children's literature 21
## 8 historical fiction 14
## 9 biography 10
## 10 classical literature 8
# Bar chart of submissions per genre, with the bars ordered by how many
# submissions each genre has.  Genre labels are rotated and the x-axis title
# and ticks are removed.
ggplot(
  data = reading,
  mapping = aes(x = reorder(Genre, Genre, FUN = length))
) +
  geom_bar() +
  labs(title = "Genre Selection", y = "Books Borrowed") +
  theme(
    axis.text.x = element_text(size = 12, angle = 90, hjust = 1),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank()
  )
chisqReadingGenre <- chisq.test(table(reading$Genre))
chisqReadingGenre
##
## Chi-squared test for given probabilities
##
## data: table(reading$Genre)
## X-squared = 140.24, df = 12, p-value < 2.2e-16
sort(chisqReadingGenre$stdres)
##
## young adult sport classical literature
## -3.9469809 -3.5157981 -3.3002067
## science fiction biography historical fiction
## -3.3002067 -2.8690239 -2.0066583
## children's literature romance non-fiction
## -0.4975186 0.1492556 0.5804384
## mystery action adventure other
## 3.8143092 4.2454920 5.1078576
## fantasy
## 5.5390404
# Submissions per publisher, most frequent first (at most six rows).
top_publishers <- reading %>%
  select(Publisher) %>%
  group_by(Publisher) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(6)
top_publishers
## # A tibble: 6 x 2
## Publisher count
## <fct> <int>
## 1 Macmillan 94
## 2 Penguin 91
## 3 Oxford 46
## 4 Cambridge 34
## 5 Pearson 32
## 6 Other 6
table(reading$Stars)
##
## 1 2 3 4 5
## 38 75 81 73 36
mean(reading$Stars)
## [1] 2.980198
reading$Stars <- as.numeric(reading$Stars)
ggplot(aes(x=Genre, y=Stars), data=reading) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90),
axis.ticks.x=element_blank()) +
ggtitle("Student Ratings")
reading$Stars <- as.numeric(reading$Stars)
aovStarRatings <- aov(Stars ~ Genre, data=reading)
summary(aovStarRatings)
## Df Sum Sq Mean Sq F value Pr(>F)
## Genre 12 20.5 1.712 1.173 0.302
## Residuals 290 423.3 1.460
# The fifty submissions with the highest star ratings, with title and student.
top50 <- reading %>%
  select(Book_title, Stars, Nickname) %>%
  arrange(desc(Stars)) %>%
  head(50)
top50
## Book_title Stars Nickname
## 1 mother teresa 5 yuto_y
## 2 tales from hans andersen 5 yuto_y
## 3 american life 5 tomo
## 4 love or monet 5 tomoya
## 5 michael jordan 5 hide
## 6 american life 5 hide
## 7 the mummy returns 5 yuto_s
## 8 the phantom of the opera 5 miku
## 9 the black cat and other stories 5 moena
## 10 anne frank 5 moena
## 11 marley & me 5 genki
## 12 the adventure of huckleberry finn 5 nono
## 13 the piano 5 asahi
## 14 simply suspence 5 yumi
## 15 k's first case 5 natsu
## 16 the black tulip 5 natsu
## 17 the merchant of venice 5 chacha
## 18 the canterville ghost 5 sakura
## 19 the return of sherlock holmes 5 sakura
## 20 jim smiley and his jumping frog and other stories 5 aya
## 21 babe 5 aya
## 22 sherlock holmes and the mystery of boscombe pool 5 aya
## 23 washington square 5 kazuma
## 24 five famous faury tales 5 miki
## 25 anna and the fighter 5 miki
## 26 pirates of the caribbean. 5 mayu
## 27 dracula 5 kasumi
## 28 marcel and the mona lisa 5 kensuke
## 29 rich man,poor man 5 kensuke
## 30 superbird 5 minako
## 31 wuthering heights 5 minako
## 32 billy budd 5 minami
## 33 the trumpet-major 5 minami
## 34 michael jordan 5 mike
## 35 surfer! 5 mike
## 36 the wizard of oz 5 kandai
## 37 johnny english 4 yuto_y
## 38 the long tunnel 4 yuto_y
## 39 marcel goes to hollywood 4 yuto_y
## 40 leonardo da vinci 4 tomoya
## 41 pirates of the caribbean: at world's end 4 tomoya
## 42 jojo's story 4 hide
## 43 new york 4 hide
## 44 hamlet 4 mai
## 45 just like a movie 4 moeka
## 46 one-way ticket short stories 4 yuto_s
## 47 the truth machine 4 miku
## 48 the long tunnel 4 miku
## 49 the adventures of tom sawyer 4 miku
## 50 next door to love 4 miku
table(reading$Genre, reading$Gender)
##
## Female Male
## action adventure 25 18
## biography 4 6
## children's literature 4 17
## classical literature 7 1
## fantasy 31 18
## historical fiction 8 6
## mystery 29 12
## non-fiction 14 12
## other 26 21
## romance 15 9
## science fiction 6 2
## sport 1 6
## young adult 3 2
# Submissions per genre, one panel per gender, bars coloured by genre.
ggplot(data = reading, mapping = aes(x = Genre)) +
  geom_bar(mapping = aes(fill = Genre)) +
  facet_wrap(~Gender) +
  labs(title = "Genre Selection by Gender", y = "Book Count") +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.ticks.x = element_blank()
  )
mean(table(reading$Genre, reading$Gender)>=5)
## [1] 0.7307692
table(reading$Genre, reading$Campus)
##
## Saitama Tokyo
## action adventure 27 16
## biography 4 6
## children's literature 14 7
## classical literature 6 2
## fantasy 31 18
## historical fiction 10 4
## mystery 24 17
## non-fiction 19 7
## other 35 12
## romance 17 7
## science fiction 5 3
## sport 2 5
## young adult 3 2
# Submissions per genre, one panel per campus, bars coloured by genre.
ggplot(data = reading, mapping = aes(x = Genre)) +
  geom_bar(mapping = aes(fill = Genre)) +
  facet_wrap(~Campus) +
  labs(title = "Genre Selection by Campus", y = "Book Count") +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.ticks.x = element_blank()
  )
mean(table(reading$Genre, reading$Campus)>=5)
## [1] 0.7307692
# List the factor levels of Nickname, i.e. the distinct student nicknames.
levels(reading$Nickname)
## [1] "alice" "asahi" "atsuhito" "atsushi" "aya" "ayumi"
## [7] "chacha" "eishi" "fumiya" "genki" "harumi" "hide"
## [13] "jun" "kakuto" "kandai" "kasumi" "kazuma" "kensuke"
## [19] "koharu" "kouki" "mai" "mako" "manabu" "mayu"
## [25] "mike" "miki" "miku" "minako" "minami" "mirei"
## [31] "moeka" "moena" "mri" "natsu" "nono" "rino"
## [37] "rio" "ryo" "saki" "sakura" "sena" "sera"
## [43] "shimpei" "shine" "shintaro" "shun" "sit" "taichi"
## [49] "tmk" "tomo" "tomoya" "yuji" "yuki" "yumi"
## [55] "yuna" "yuto_s" "yuto_y" "yuuka"
# Per gender: distinct Tokyo nicknames divided by the number of Nickname
# levels in Tokyo. The column is called `counts` but holds a proportion.
summarize(
  group_by(Tokyo, Gender),
  counts = n_distinct(Nickname) / nlevels(Tokyo$Nickname)
)
## # A tibble: 2 x 2
## Gender counts
## <fct> <dbl>
## 1 Female 0.3
## 2 Male 0.7
# Per gender: distinct SaitamaPeriod3 nicknames divided by the number of
# Nickname levels in that subset (a proportion, despite the name `counts`).
summarize(
  group_by(SaitamaPeriod3, Gender),
  counts = n_distinct(Nickname) / nlevels(SaitamaPeriod3$Nickname)
)
## # A tibble: 2 x 2
## Gender counts
## <fct> <dbl>
## 1 Female 0.8
## 2 Male 0.2
# Per gender: distinct SaitamaPeriod4 nicknames divided by the number of
# Nickname levels in that subset (a proportion, despite the name `counts`).
summarize(
  group_by(SaitamaPeriod4, Gender),
  counts = n_distinct(Nickname) / nlevels(SaitamaPeriod4$Nickname)
)
## # A tibble: 2 x 2
## Gender counts
## <fct> <dbl>
## 1 Female 0.5
## 2 Male 0.5
# Combine the Tokyo feature tables column-wise into one matrix.
TClusterDf <- cbind(Ttitle, Tauthor, Tpublisher, Tgenre, Tstars)
library(ggdendro)
# Row distances -> hierarchical clustering -> dendrogram plot.
figure3 <- TClusterDf %>%
  dist() %>%
  hclust() %>%
  ggdendrogram()
figure3
# Show the book metadata and star ratings reported by "mai" and "rio".
reading %>%
  filter(Nickname %in% c("mai", "rio")) %>%
  group_by(Nickname) %>%
  select(Nickname, Book_title, Author, Publisher, Genre, Stars)
## # A tibble: 6 x 6
## # Groups: Nickname [2]
## Nickname Book_title Author Publisher Genre Stars
## <fct> <fct> <fct> <fct> <fct> <dbl>
## 1 mai a midsummmer night’s dream william s… Pearson fantasy 3
## 2 mai strong medicine richard m… Cambridge mystery 3
## 3 mai hamlet william s… Pearson classica… 4
## 4 rio strong medicine richard … Cambridge mystery 3
## 5 rio a midsummer night's dream william s… Pearson fantasy 3
## 6 rio a chrismas carol charis di… Oxford fantasy 3
# Nickname-by-feature contingency tables for the SaitamaPeriod3 subset.
SaitamaPeriod3title <- table(
  SaitamaPeriod3$Nickname, SaitamaPeriod3$Book_title
)
SaitamaPeriod3author <- table(
  SaitamaPeriod3$Nickname, SaitamaPeriod3$Author
)
SaitamaPeriod3publisher <- table(
  SaitamaPeriod3$Nickname, SaitamaPeriod3$Publisher
)
SaitamaPeriod3genre <- table(
  SaitamaPeriod3$Nickname, SaitamaPeriod3$Genre
)
SaitamaPeriod3stars <- table(
  SaitamaPeriod3$Nickname, SaitamaPeriod3$Stars
)
# One row per nickname, one column per observed feature value.
SaitamaPeriod3ClusterDf <- cbind(
  SaitamaPeriod3title,
  SaitamaPeriod3author,
  SaitamaPeriod3publisher,
  SaitamaPeriod3genre,
  SaitamaPeriod3stars
)
# Row distances -> hierarchical clustering -> dendrogram plot.
SaitamaPeriod3ClusterDf %>%
  dist() %>%
  hclust() %>%
  ggdendrogram()
# Nickname-by-feature contingency tables for the SaitamaPeriod4 subset.
# The title table is built from Book_title; it was previously built from
# Author, which counted authors twice and left titles out of the clustering.
SaitamaPeriod4title <- table(SaitamaPeriod4$Nickname, SaitamaPeriod4$Book_title)
SaitamaPeriod4author <- table(SaitamaPeriod4$Nickname, SaitamaPeriod4$Author)
SaitamaPeriod4publisher <- table(SaitamaPeriod4$Nickname, SaitamaPeriod4$Publisher)
SaitamaPeriod4genre <- table(SaitamaPeriod4$Nickname, SaitamaPeriod4$Genre)
SaitamaPeriod4stars <- table(SaitamaPeriod4$Nickname, SaitamaPeriod4$Stars)
# One row per nickname, one column per observed feature value.
SaitamaPeriod4ClusterDf <- cbind(SaitamaPeriod4title, SaitamaPeriod4author, SaitamaPeriod4publisher, SaitamaPeriod4genre, SaitamaPeriod4stars)
# Cluster students on the distances between their feature rows and plot the dendrogram.
ggdendrogram(hclust(dist(SaitamaPeriod4ClusterDf)))
# Quartiles, mean and range of the teacher's assessment scores.
summary(reading$Teacher_Assessment)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 2.851 3.000 4.000
# Sum each student's assessment scores and keep the result as a plain
# data.frame, then summarise the distribution of those totals.
scores <- reading %>%
  select(Nickname, Teacher_Assessment) %>%
  group_by(Nickname) %>%
  summarize(Total = sum(Teacher_Assessment)) %>%
  as.data.frame()
summary(scores$Total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 9.0 15.0 14.9 20.0 31.0
# Rescale the summary of per-student totals by 15 / (number of Week levels).
# NOTE(review): presumably projects the totals onto a 15-week term -- confirm.
15/nlevels(reading$Week)*summary(scores$Total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 15.00 25.00 24.83 33.33 51.67
# Mean number of reports per student: rows divided by Nickname levels.
averageNpostings <- nrow(reading) / nlevels(reading$Nickname)
print(averageNpostings)
## [1] 5.224138
# Total assessment points per student.
tapply(reading$Teacher_Assessment, reading$Nickname, sum)
## alice asahi atsuhito atsushi aya ayumi chacha eishi
## 10 15 19 2 28 16 0 14
## fumiya genki harumi hide jun kakuto kandai kasumi
## 14 22 29 25 19 15 8 9
## kazuma kensuke koharu kouki mai mako manabu mayu
## 25 12 15 20 9 18 9 22
## mike miki miku minako minami mirei moeka moena
## 22 17 24 15 15 15 15 29
## mri natsu nono rino rio ryo saki sakura
## 7 25 12 6 7 3 11 27
## sena sera shimpei shine shintaro shun sit taichi
## 5 20 20 3 19 7 4 12
## tmk tomo tomoya yuji yuki yumi yuna yuto_s
## 13 3 9 9 19 31 20 16
## yuto_y yuuka
## 16 13
# Mean assessment score per student.
tapply(reading$Teacher_Assessment, reading$Nickname, mean)
## alice asahi atsuhito atsushi aya ayumi chacha eishi
## 2.000000 3.750000 2.714286 2.000000 3.500000 3.200000 0.000000 3.500000
## fumiya genki harumi hide jun kakuto kandai kasumi
## 2.333333 2.750000 3.222222 3.125000 3.166667 2.142857 2.000000 3.000000
## kazuma kensuke koharu kouki mai mako manabu mayu
## 3.571429 2.400000 3.000000 2.857143 3.000000 3.000000 2.250000 2.750000
## mike miki miku minako minami mirei moeka moena
## 3.142857 2.428571 3.428571 3.000000 3.000000 2.500000 2.500000 3.625000
## mri natsu nono rino rio ryo saki sakura
## 2.333333 3.571429 3.000000 3.000000 2.333333 1.500000 2.750000 3.375000
## sena sera shimpei shine shintaro shun sit taichi
## 1.666667 2.500000 2.857143 3.000000 3.166667 3.500000 4.000000 2.400000
## tmk tomo tomoya yuji yuki yumi yuna yuto_s
## 2.600000 3.000000 2.250000 3.000000 3.166667 3.875000 2.500000 2.285714
## yuto_y yuuka
## 2.285714 2.600000
# Standard deviation of scores per student. sd() of a single value is NA,
# so students with only one report show NA here.
tapply(reading$Teacher_Assessment, reading$Nickname, sd)
## alice asahi atsuhito atsushi aya ayumi chacha
## 0.0000000 0.5000000 0.4879500 NA 0.5345225 0.4472136 0.0000000
## eishi fumiya genki harumi hide jun kakuto
## 0.5773503 0.5163978 0.4629100 0.8333333 1.3562027 0.4082483 0.3779645
## kandai kasumi kazuma kensuke koharu kouki mai
## 0.0000000 1.0000000 0.5345225 0.5477226 0.7071068 0.3779645 0.0000000
## mako manabu mayu mike miki miku minako
## 0.6324555 0.5000000 0.4629100 0.8997354 0.5345225 0.5345225 0.7071068
## minami mirei moeka moena mri natsu nono
## 0.0000000 0.5477226 0.5477226 0.5175492 0.5773503 0.5345225 0.0000000
## rino rio ryo saki sakura sena sera
## 0.0000000 0.5773503 0.7071068 0.5000000 0.5175492 0.5773503 0.5345225
## shimpei shine shintaro shun sit taichi tmk
## 0.6900656 NA 0.4082483 0.7071068 NA 0.8944272 0.5477226
## tomo tomoya yuji yuki yumi yuna yuto_s
## NA 0.5000000 0.0000000 0.7527727 0.3535534 0.5345225 0.4879500
## yuto_y yuuka
## 1.2535663 0.5477226
# Frequency of each assessment score value.
table(reading$Teacher_Assessment)
##
## 0 1 2 3 4
## 4 3 86 151 59
# The same score distribution expressed as whole-number percentages.
round(prop.table(table(reading$Teacher_Assessment))*100)
##
## 0 1 2 3 4
## 1 1 28 50 19
# Histogram of assessment scores with one bin per score point.
ggplot(data = reading, mapping = aes(x = Teacher_Assessment)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Homework Assignment Scores", x = "score", y = "Count")
# Text-length and lexical-variety columns to compare with the teacher score.
vars <- c(
  "summaryTokens", "opinionTokens", "totalTokens",
  "summarySentenceCount", "opinionSentenceCount",
  "summaryTypes", "opinionTypes",
  "totalSentenceCount", "totalTypes",
  "summaryTTR", "opinionTTR"
)
# Correlation of the teacher score with each of those columns.
cor(reading$Teacher_Assessment, reading[vars])
## summaryTokens opinionTokens totalTokens summarySentenceCount
## [1,] 0.2971513 0.3969586 0.3752206 0.2486507
## opinionSentenceCount summaryTypes opinionTypes totalSentenceCount
## [1,] 0.2771862 0.3226383 0.4108225 0.2940118
## totalTypes summaryTTR opinionTTR
## [1,] 0.424951 -0.3577886 -0.2786896
# Pearson correlation test between the teacher score and totalTypes.
cor.test(reading$Teacher_Assessment, reading$totalTypes)
##
## Pearson's product-moment correlation
##
## data: reading$Teacher_Assessment and reading$totalTypes
## t = 8.1446, df = 301, p-value = 1.024e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3279774 0.5130623
## sample estimates:
## cor
## 0.424951
# Pearson correlation test between the teacher score and summarySentenceCount.
cor.test(reading$Teacher_Assessment, reading$summarySentenceCount)
##
## Pearson's product-moment correlation
##
## data: reading$Teacher_Assessment and reading$summarySentenceCount
## t = 4.4538, df = 301, p-value = 1.191e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1398921 0.3514812
## sample estimates:
## cor
## 0.2486507
# Scatter plot of totalTypes against the teacher score with a fitted
# linear-regression line.
ggplot(
  data = reading,
  mapping = aes(x = totalTypes, y = Teacher_Assessment)
) +
  geom_point() +
  stat_smooth(method = "lm") +
  labs(
    title = "Correlation of Total Types and\nTeacher Assessment",
    x = "Total Types",
    y = "Teacher Assessment"
  )
# Full linear model: teacher score regressed on the summary/opinion token,
# sentence, type and TTR measures (the total* columns are left out).
linMod1 <- lm(Teacher_Assessment ~ summaryTokens + opinionTokens +
summarySentenceCount + opinionSentenceCount +
summaryTypes + opinionTypes +
summaryTTR + opinionTTR, data = reading)
# Coefficient table and fit statistics for the full model.
summary(linMod1)
##
## Call:
## lm(formula = Teacher_Assessment ~ summaryTokens + opinionTokens +
## summarySentenceCount + opinionSentenceCount + summaryTypes +
## opinionTypes + summaryTTR + opinionTTR, data = reading)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.00843 -0.43792 0.08181 0.40457 1.52557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.862386 0.600814 6.429 5.19e-10 ***
## summaryTokens -0.006297 0.002094 -3.007 0.002862 **
## opinionTokens 0.003424 0.008455 0.405 0.685791
## summarySentenceCount -0.014484 0.011687 -1.239 0.216234
## opinionSentenceCount -0.036111 0.037582 -0.961 0.337414
## summaryTypes 0.018401 0.004956 3.713 0.000245 ***
## opinionTypes 0.011665 0.012831 0.909 0.364039
## summaryTTR -3.283157 0.623276 -5.268 2.68e-07 ***
## opinionTTR -0.003850 0.482982 -0.008 0.993645
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6645 on 294 degrees of freedom
## Multiple R-squared: 0.3033, Adjusted R-squared: 0.2843
## F-statistic: 16 on 8 and 294 DF, p-value: < 2.2e-16
# AIC-based stepwise selection starting from the full model. With no scope
# argument, step() only removes terms (backward elimination).
linMod2 <- step(linMod1)
## Start: AIC=-238.85
## Teacher_Assessment ~ summaryTokens + opinionTokens + summarySentenceCount +
## opinionSentenceCount + summaryTypes + opinionTypes + summaryTTR +
## opinionTTR
##
## Df Sum of Sq RSS AIC
## - opinionTTR 1 0.0000 129.81 -240.85
## - opinionTokens 1 0.0724 129.88 -240.68
## - opinionTypes 1 0.3649 130.17 -240.00
## - opinionSentenceCount 1 0.4076 130.22 -239.90
## - summarySentenceCount 1 0.6781 130.49 -239.27
## <none> 129.81 -238.85
## - summaryTokens 1 3.9936 133.80 -231.66
## - summaryTypes 1 6.0854 135.89 -226.96
## - summaryTTR 1 12.2511 142.06 -213.52
##
## Step: AIC=-240.85
## Teacher_Assessment ~ summaryTokens + opinionTokens + summarySentenceCount +
## opinionSentenceCount + summaryTypes + opinionTypes + summaryTTR
##
## Df Sum of Sq RSS AIC
## - opinionTokens 1 0.0779 129.89 -242.66
## - opinionTypes 1 0.3732 130.18 -241.98
## - opinionSentenceCount 1 0.4319 130.24 -241.84
## - summarySentenceCount 1 0.6782 130.49 -241.27
## <none> 129.81 -240.85
## - summaryTokens 1 3.9937 133.80 -233.66
## - summaryTypes 1 6.0855 135.89 -228.96
## - summaryTTR 1 12.2825 142.09 -215.45
##
## Step: AIC=-242.66
## Teacher_Assessment ~ summaryTokens + summarySentenceCount + opinionSentenceCount +
## summaryTypes + opinionTypes + summaryTTR
##
## Df Sum of Sq RSS AIC
## - opinionSentenceCount 1 0.3563 130.24 -243.83
## - summarySentenceCount 1 0.6568 130.54 -243.14
## <none> 129.89 -242.66
## - summaryTokens 1 3.9406 133.83 -235.61
## - summaryTypes 1 6.0119 135.90 -230.95
## - opinionTypes 1 8.5507 138.44 -225.35
## - summaryTTR 1 12.2789 142.16 -217.29
##
## Step: AIC=-243.83
## Teacher_Assessment ~ summaryTokens + summarySentenceCount + summaryTypes +
## opinionTypes + summaryTTR
##
## Df Sum of Sq RSS AIC
## <none> 130.24 -243.83
## - summarySentenceCount 1 1.3498 131.59 -242.71
## - summaryTokens 1 4.0704 134.31 -236.51
## - summaryTypes 1 6.7127 136.96 -230.61
## - summaryTTR 1 13.3064 143.55 -216.36
## - opinionTypes 1 18.9822 149.22 -204.61
# Coefficient table and fit statistics for the model kept by step().
summary(linMod2)
##
## Call:
## lm(formula = Teacher_Assessment ~ summaryTokens + summarySentenceCount +
## summaryTypes + opinionTypes + summaryTTR, data = reading)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.03991 -0.43249 0.08115 0.39076 1.55170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.876827 0.421775 9.192 < 2e-16 ***
## summaryTokens -0.006337 0.002080 -3.047 0.002522 **
## summarySentenceCount -0.018574 0.010587 -1.754 0.080384 .
## summaryTypes 0.018973 0.004849 3.912 0.000113 ***
## opinionTypes 0.013891 0.002111 6.579 2.13e-10 ***
## summaryTTR -3.372931 0.612314 -5.508 7.85e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6622 on 297 degrees of freedom
## Multiple R-squared: 0.301, Adjusted R-squared: 0.2892
## F-statistic: 25.57 on 5 and 297 DF, p-value: < 2.2e-16
# Mean assessment score per gender.
tapply(reading$Teacher_Assessment, reading$Gender, mean)
## Female Male
## 2.959538 2.707692
# Standard deviation of assessment scores per gender.
tapply(reading$Teacher_Assessment, reading$Gender, sd)
## Female Male
## 0.7575848 0.8016084
# Kernel density of assessment scores, one line per gender.
ggplot(
  data = reading,
  mapping = aes(x = Teacher_Assessment, y = ..density.., colour = Gender)
) +
  stat_density(geom = "line") +
  labs(
    title = "Homework Assignment Scores per Gender",
    x = "Score",
    y = "Density"
  )
# Split the reports by gender. Both subsets use subset(), which drops rows
# whose Gender is NA. The earlier negated `[` indexing for Males kept such
# rows as all-NA records, so the two groups were built inconsistently.
Females <- subset(reading, Gender == "Female")
Males <- subset(reading, Gender != "Female")
# Two-sample t-test on the scores (t.test() defaults to the Welch variant).
t.test(Females$Teacher_Assessment, Males$Teacher_Assessment)
##
## Welch Two Sample t-test
##
## data: Females$Teacher_Assessment and Males$Teacher_Assessment
## t = 2.771, df = 269.29, p-value = 0.005978
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.07290596 0.43078457
## sample estimates:
## mean of x mean of y
## 2.959538 2.707692
# Cohen's d effect size (Females vs. Males) with a 95% confidence interval.
cohen.d(Females$Teacher_Assessment, Males$Teacher_Assessment, conf.level=0.95)
##
## Cohen's d
##
## d estimate: 0.3242263 (small)
## 95 percent confidence interval:
## inf sup
## 0.09434562 0.55410702
# Mean assessment score per campus.
tapply(reading$Teacher_Assessment, reading$Campus, mean)
## Saitama Tokyo
## 2.949239 2.669811
# Standard deviation of assessment scores per campus.
tapply(reading$Teacher_Assessment, reading$Campus, sd)
## Saitama Tokyo
## 0.7609486 0.8014475
# Kernel density of assessment scores, one line per campus.
ggplot(
  data = reading,
  mapping = aes(x = Teacher_Assessment, y = ..density.., colour = Campus)
) +
  stat_density(geom = "line") +
  labs(
    title = "Homework Assignment Scores per Campus",
    x = "Score",
    y = "Density"
  )
# Two-sample t-test of scores, Tokyo (x) vs. Saitama (y); t.test() defaults
# to the Welch variant.
t.test(Tokyo$Teacher_Assessment, Saitama$Teacher_Assessment)
##
## Welch Two Sample t-test
##
## data: Tokyo$Teacher_Assessment and Saitama$Teacher_Assessment
## t = -2.9456, df = 205.65, p-value = 0.003595
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.46645514 -0.09239937
## sample estimates:
## mean of x mean of y
## 2.669811 2.949239
# Cohen's d effect size with a 95% confidence interval. The groups are
# passed as Saitama, Tokyo -- the reverse of the t.test() call above -- so
# the sign of d is opposite to the sign of the t statistic.
cohen.d(Saitama$Teacher_Assessment, Tokyo$Teacher_Assessment, conf.level=0.95)
##
## Cohen's d
##
## d estimate: 0.3604042 (small)
## 95 percent confidence interval:
## inf sup
## 0.1216135 0.5991948
# Interaction plot: mean score per gender, one line and point set per campus.
ggplot(
  data = reading,
  mapping = aes(
    x = Gender, y = Teacher_Assessment,
    group = Campus, color = Campus
  )
) +
  stat_summary(fun.y = mean, geom = "line") +
  stat_summary(fun.y = mean, geom = "point") +
  labs(
    title = "Effect of Gender and Campus \non Homework Assignment Scores",
    y = "Score"
  )
# Two-way ANOVA of score on Gender, Campus and their interaction.
# aov() uses sequential sums of squares, so with unequal cell sizes the
# order Gender-then-Campus affects the main-effect rows.
genderCampusAov <- aov(Teacher_Assessment~Gender*Campus,data= reading)
summary(genderCampusAov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Gender 1 4.71 4.708 7.912 0.00523 **
## Campus 1 2.38 2.375 3.992 0.04662 *
## Gender:Campus 1 1.33 1.334 2.242 0.13536
## Residuals 299 177.90 0.595
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD pairwise comparisons for each term of the Gender x Campus model.
TukeyHSD(genderCampusAov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Teacher_Assessment ~ Gender * Campus, data = reading)
##
## $Gender
## diff lwr upr p adj
## Male-Female -0.2518453 -0.4280382 -0.07565234 0.0052347
##
## $Campus
## diff lwr upr p adj
## Tokyo-Saitama -0.1678927 -0.3507436 0.01495823 0.0717765
##
## $`Gender:Campus`
## diff lwr upr p adj
## Male:Saitama-Female:Saitama -0.05762756 -0.3759265 0.26067135 0.9660559
## Female:Tokyo-Female:Saitama -0.03170163 -0.4318871 0.36848386 0.9969618
## Male:Tokyo-Female:Saitama -0.39924549 -0.6821333 -0.11635771 0.0017758
## Female:Tokyo-Male:Saitama 0.02592593 -0.4278580 0.47970982 0.9988516
## Male:Tokyo-Male:Saitama -0.34161793 -0.6962965 0.01306063 0.0637710
## Male:Tokyo-Female:Tokyo -0.36754386 -0.7972311 0.06214337 0.1228775
# Mean assessment score per Month (groups are listed in level order, not
# calendar order).
tapply(reading$Teacher_Assessment, reading$Month, mean)
## April June May
## 2.883117 2.975000 2.811828
# One-way ANOVA of score on Month.
monthAov <- aov(Teacher_Assessment~Month, data=reading)
summary(monthAov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Month 2 0.98 0.4899 0.793 0.453
## Residuals 300 185.34 0.6178
# Number of reports per Week level.
table(reading$Week)
##
## W1 W2 W3 W4 W5 W6 W7 W8 W9
## 1 25 46 43 44 48 38 34 24
# Mean assessment score per Week level.
tapply(reading$Teacher_Assessment, reading$Week, mean)
## W1 W2 W3 W4 W5 W6 W7 W8
## 4.000000 2.800000 3.000000 2.790698 3.068182 2.875000 2.552632 2.529412
## W9
## 3.166667
# Weekly mean score, drawn as one line and point set per campus.
figure4 <- ggplot(
  data = reading,
  mapping = aes(
    x = Week, y = Teacher_Assessment,
    group = Campus, color = Campus
  )
) +
  stat_summary(fun.y = mean, geom = "line") +
  stat_summary(fun.y = mean, geom = "point") +
  labs(
    y = "Average Homework Score",
    title = "Weekly Homework Assignment Scores"
  )
figure4
# One-way ANOVA of score on Week.
weekAov <- aov(Teacher_Assessment~Week,data= reading)
summary(weekAov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Week 8 13.96 1.7446 2.976 0.00321 **
## Residuals 294 172.36 0.5863
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD pairwise comparisons between all pairs of weeks.
TukeyHSD(weekAov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Teacher_Assessment ~ Week, data = reading)
##
## $Week
## diff lwr upr p adj
## W2-W1 -1.200000000 -3.6401769793 1.240176979 0.8372638
## W3-W1 -1.000000000 -3.4186591472 1.418659147 0.9331055
## W4-W1 -1.209302326 -3.6297559524 1.211151301 0.8248438
## W5-W1 -0.931818182 -3.3516466108 1.488010247 0.9553290
## W6-W1 -1.125000000 -3.5425868088 1.292586809 0.8753962
## W7-W1 -1.447368421 -3.8714384456 0.976701604 0.6377601
## W8-W1 -1.470588235 -3.8983117209 0.957135250 0.6191399
## W9-W1 -0.833333333 -3.2754647999 1.608798133 0.9784818
## W3-W2 0.200000000 -0.3945452451 0.794545245 0.9803262
## W4-W2 -0.009302326 -0.6111060626 0.592501411 1.0000000
## W5-W2 0.268181818 -0.3311024246 0.867466061 0.8978111
## W6-W2 0.075000000 -0.5151677357 0.665167736 0.9999826
## W7-W2 -0.247368421 -0.8635562674 0.368819425 0.9432685
## W8-W2 -0.270588235 -0.9009954851 0.359819015 0.9179648
## W9-W2 0.366666667 -0.3171301440 1.050463477 0.7609223
## W4-W3 -0.209302326 -0.7168613053 0.298256654 0.9340688
## W5-W3 0.068181818 -0.4363872888 0.572750925 0.9999720
## W6-W3 -0.125000000 -0.6187067310 0.368706731 0.9970483
## W7-W3 -0.447368421 -0.9719023495 0.077165507 0.1650380
## W8-W3 -0.470588235 -1.0117551972 0.070578727 0.1461947
## W9-W3 0.166666667 -0.4358501035 0.769183437 0.9945590
## W5-W4 0.277484144 -0.2356178667 0.790586154 0.7523004
## W6-W4 0.084302326 -0.4181218236 0.586726475 0.9998541
## W7-W4 -0.238066095 -0.7708132507 0.294681060 0.8985631
## W8-W4 -0.261285910 -0.8104173785 0.287845559 0.8609723
## W9-W4 0.375968992 -0.2337113729 0.985649357 0.5954857
## W6-W5 -0.193181818 -0.6925853539 0.306221718 0.9541366
## W7-W5 -0.515550239 -1.0454496611 0.014349183 0.0635916
## W8-W5 -0.538770053 -1.0851391923 0.007599085 0.0566360
## W9-W5 0.098484848 -0.5087087067 0.705678404 0.9998871
## W7-W6 -0.322368421 -0.8419353078 0.197198466 0.5872376
## W8-W6 -0.345588235 -0.8819422120 0.190765741 0.5351432
## W9-W6 0.291666667 -0.3065309311 0.889864264 0.8436950
## W8-W7 -0.023219814 -0.5880782869 0.541638658 1.0000000
## W9-W7 0.614035088 -0.0098478114 1.237917987 0.0575859
## W9-W8 0.637254902 -0.0006759018 1.275185706 0.0504811
# Number of reports per Day value, in ascending order of count.
sort(table(reading$Day))
##
## Wednesday Sunday Saturday Tuesday Monday Thursday Friday
## 17 19 20 40 48 74 85
# Mean assessment score per Day value, in ascending order of mean.
sort(tapply(reading$Teacher_Assessment, reading$Day, mean))
## Monday Tuesday Sunday Friday Wednesday Thursday Saturday
## 2.562500 2.725000 2.842105 2.858824 2.882353 3.000000 3.200000
# One-way ANOVA of score on Day.
dayAov <- aov(Teacher_Assessment~Day,data= reading)
# Summarise the Day model just fitted. The script previously called
# summary(weekAov) here, so the Day ANOVA table was never displayed.
summary(dayAov)
# NOTE: the output that used to follow this call was a copy of the Week
# ANOVA table; re-run the script to obtain the Day table.
# Tukey HSD pairwise comparisons between all pairs of days.
TukeyHSD(dayAov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Teacher_Assessment ~ Day, data = reading)
##
## $Day
## diff lwr upr p adj
## Monday-Friday -0.29632353 -0.71149222 0.1188452 0.3442355
## Saturday-Friday 0.34117647 -0.23030091 0.9126538 0.5678170
## Sunday-Friday -0.01671827 -0.60024299 0.5668065 1.0000000
## Thursday-Friday 0.14117647 -0.22442008 0.5067730 0.9130497
## Tuesday-Friday -0.13382353 -0.57472790 0.3070808 0.9722053
## Wednesday-Friday 0.02352941 -0.58740560 0.6344644 0.9999998
## Saturday-Monday 0.63750000 0.02550525 1.2494947 0.0350019
## Sunday-Monday 0.27960526 -0.34365413 0.9028647 0.8365100
## Thursday-Monday 0.43750000 0.01134039 0.8636596 0.0398998
## Tuesday-Monday 0.16250000 -0.32978837 0.6547884 0.9580168
## Wednesday-Monday 0.31985294 -0.32914072 0.9688466 0.7664097
## Sunday-Saturday -0.35789474 -1.09455859 0.3787691 0.7782968
## Thursday-Saturday -0.20000000 -0.77951132 0.3795113 0.9480657
## Tuesday-Saturday -0.47500000 -1.10473740 0.1547374 0.2778416
## Wednesday-Saturday -0.31764706 -1.07620762 0.4409135 0.8766020
## Thursday-Sunday 0.15789474 -0.43350029 0.7492898 0.9855666
## Tuesday-Sunday -0.11710526 -0.75779544 0.5235849 0.9981639
## Wednesday-Sunday 0.04024768 -0.72742988 0.8079252 0.9999988
## Tuesday-Thursday -0.27500000 -0.72626892 0.1762689 0.5429382
## Wednesday-Thursday -0.11764706 -0.73610366 0.5008095 0.9977032
## Wednesday-Tuesday 0.15735294 -0.50839806 0.8231039 0.9924327
# Per-student report card: number of reports, total and mean score, the mean
# multiplied by 25 as `percent`, and rounded mean word counts for the
# summary and opinion sections.
scores <- reading %>%
  select(
    Nickname, Campus, Teacher_Assessment,
    summaryTokens, opinionTokens
  ) %>%
  group_by(Nickname) %>%
  summarize(
    books = length(Teacher_Assessment),
    score = sum(Teacher_Assessment),
    average = mean(Teacher_Assessment),
    percent = average * 25,
    summary_Words = round(mean(summaryTokens)),
    opinion_Words = round(mean(opinionTokens))
  )
# Print one row per Nickname level instead of the default truncated view.
print(tbl_df(scores), n = nlevels(reading$Nickname))
## # A tibble: 58 x 7
## Nickname books score average percent summary_Words opinion_Words
## <fct> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 alice 5 10 2 50 187 34
## 2 asahi 4 15 3.75 93.8 180 36
## 3 atsuhito 7 19 2.71 67.9 129 21
## 4 atsushi 1 2 2 50 146 1
## 5 aya 8 28 3.5 87.5 203 50
## 6 ayumi 5 16 3.2 80 256 37
## 7 chacha 2 0 0 0 160 29
## 8 eishi 4 14 3.5 87.5 141 67
## 9 fumiya 6 14 2.33 58.3 88 70
## 10 genki 8 22 2.75 68.8 179 24
## 11 harumi 9 29 3.22 80.6 444 53
## 12 hide 8 25 3.12 78.1 185 60
## 13 jun 6 19 3.17 79.2 187 35
## 14 kakuto 7 15 2.14 53.6 88 18
## 15 kandai 4 8 2 50 79 35
## 16 kasumi 3 9 3 75 190 51
## 17 kazuma 7 25 3.57 89.3 219 62
## 18 kensuke 5 12 2.4 60 104 20
## 19 koharu 5 15 3 75 136 19
## 20 kouki 7 20 2.86 71.4 187 13
## 21 mai 3 9 3 75 135 15
## 22 mako 6 18 3 75 138 33
## 23 manabu 4 9 2.25 56.2 65 38
## 24 mayu 8 22 2.75 68.8 111 24
## 25 mike 7 22 3.14 78.6 155 44
## 26 miki 7 17 2.43 60.7 140 34
## 27 miku 7 24 3.43 85.7 203 53
## 28 minako 5 15 3 75 179 43
## 29 minami 5 15 3 75 208 41
## 30 mirei 6 15 2.5 62.5 147 21
## 31 moeka 6 15 2.5 62.5 153 11
## 32 moena 8 29 3.62 90.6 203 76
## 33 mri 3 7 2.33 58.3 147 17
## 34 natsu 7 25 3.57 89.3 255 158
## 35 nono 4 12 3 75 227 19
## 36 rino 2 6 3 75 160 74
## 37 rio 3 7 2.33 58.3 76 19
## 38 ryo 2 3 1.5 37.5 54 6
## 39 saki 4 11 2.75 68.8 153 27
## 40 sakura 8 27 3.38 84.4 194 31
## 41 sena 3 5 1.67 41.7 34 6
## 42 sera 8 20 2.5 62.5 154 32
## 43 shimpei 7 20 2.86 71.4 119 6
## 44 shine 1 3 3 75 119 20
## 45 shintaro 6 19 3.17 79.2 181 27
## 46 shun 2 7 3.5 87.5 231 83
## 47 sit 1 4 4 100 262 59
## 48 taichi 5 12 2.4 60 62 69
## 49 tmk 5 13 2.6 65 184 22
## 50 tomo 1 3 3 75 48 72
## 51 tomoya 4 9 2.25 56.2 68 37
## 52 yuji 3 9 3 75 147 27
## 53 yuki 6 19 3.17 79.2 160 37
## 54 yumi 8 31 3.88 96.9 176 109
## 55 yuna 8 20 2.5 62.5 109 16
## 56 yuto_s 7 16 2.29 57.1 147 34
## 57 yuto_y 7 16 2.29 57.1 143 35
## 58 yuuka 5 13 2.6 65 110 42
COMMENT About 2 per cent of the sentences were duplicates.