Text Analysis - wordcloud

Graphic analysis of a text using the package wordcloud

library(NLP)
aFile = readLines("textanalysis.txt")
library(tm)
myCorpus = Corpus(VectorSource(aFile))
myCorpus = tm_map(myCorpus, tolower)
transformation drops documents
myCorpus = tm_map(myCorpus, removePunctuation)
transformation drops documents
myCorpus = tm_map(myCorpus, removeNumbers)
transformation drops documents
myCorpus = tm_map(myCorpus, removeWords, stopwords("english"))
transformation drops documents
myDTM = TermDocumentMatrix(myCorpus, control = list(minWordLength = 3))
m = as.matrix(myDTM)
v = sort(rowSums(m), decreasing = TRUE)
library(wordcloud)
set.seed(433)
wordcloud(names(v), v, scale=c(2,.1), min.freq = 20, 
          max.words = 50, colors=c("green","red","blue"),
          random.color = FALSE)

General Lineal Model - Logistic regression

The model is trained with historical data. With the model parameters you can make a “prediction” of the response variable with new data (predictors).

#training data
customer = read.csv('customer.csv', header=TRUE)
customer
str(customer)
'data.frame':   100 obs. of  5 variables:
 $ CustomerID : int  1 2 3 4 5 6 7 8 9 10 ...
 $ gender     : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 1 2 2 ...
 $ age        : int  36 26 21 49 42 49 47 50 26 40 ...
 $ visit.times: int  5 3 2 5 4 1 4 1 2 3 ...
 $ buy        : Factor w/ 2 levels "no","yes": 2 1 2 2 1 1 2 1 1 1 ...
#fitting the model
logitfit = glm(buy ~ visit.times + age + gender, data=customer, family=binomial(logit))
glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logitfit)

Call:
glm(formula = buy ~ visit.times + age + gender, family = binomial(logit), 
    data = customer)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.909   0.000   0.000   0.000   1.245  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept)   26.5278    18.6925   1.419    0.156
visit.times    9.7809     6.1264   1.597    0.110
age           -1.1396     0.7592  -1.501    0.133
genderM      -71.0222  4170.8348  -0.017    0.986

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 133.7496  on 99  degrees of freedom
Residual deviance:   7.1936  on 96  degrees of freedom
AIC: 15.194

Number of Fisher Scoring iterations: 21
#predicting outcome with new data
CustomerID <- c(1,2,3)
gender <- c("F","F","M")
age <- c(36,50,21)
visit.times <- c(5,1,2)
buy <- c("","","")
custom2 <- data.frame(CustomerID,gender,age,visit.times,buy)
custom2
pr <- predict(logitfit, custom2, type="response")
pr
           1            2            3 
1.000000e+00 1.055095e-09 2.220446e-16 

Maps

From a world map you can select a country and cities to be plotted in a map.

library(maps)
library(mapdata)
library(ggplot2)
library(ggrepel)
cities = c("San Juan")
global <- map_data("world")
gg1 <- ggplot() + geom_polygon(data = global, aes(x=long, y = lat, group = group), fill = "green", color = "blue") + coord_fixed(1.3)
gg1

coors <- data.frame(long = c(-66.110000), lat = c(18.450000), stringsAsFactors = FALSE)
coors$cities <- cities 
gg1 + geom_point(data=coors, aes(long, lat), colour="red", size=1) +
  ggtitle("Puerto Rico") +
  geom_text_repel(data=coors, aes(long, lat, label=cities)) + xlim(-68,-64.5) + ylim(17.5,19)

LS0tCnRpdGxlOiAiRXh0cmFzIgphdXRob3I6ICJELl9TLl9GZXJuw6FuZGV6LWRlbC1WaXNvIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIyNUZXh0IEFuYWx5c2lzIC0gd29yZGNsb3VkCkdyYXBoaWMgYW5hbHlzaXMgb2YgYSB0ZXh0IHVzaW5nIHRoZSBwYWNrYWdlIF9fd29yZGNsb3VkX18KYGBge3J9CmxpYnJhcnkoTkxQKQphRmlsZSA9IHJlYWRMaW5lcygidGV4dGFuYWx5c2lzLnR4dCIpCmxpYnJhcnkodG0pCm15Q29ycHVzID0gQ29ycHVzKFZlY3RvclNvdXJjZShhRmlsZSkpCm15Q29ycHVzID0gdG1fbWFwKG15Q29ycHVzLCB0b2xvd2VyKQpteUNvcnB1cyA9IHRtX21hcChteUNvcnB1cywgcmVtb3ZlUHVuY3R1YXRpb24pCm15Q29ycHVzID0gdG1fbWFwKG15Q29ycHVzLCByZW1vdmVOdW1iZXJzKQpteUNvcnB1cyA9IHRtX21hcChteUNvcnB1cywgcmVtb3ZlV29yZHMsIHN0b3B3b3JkcygiZW5nbGlzaCIpKQpteURUTSA9IFRlcm1Eb2N1bWVudE1hdHJpeChteUNvcnB1cywgY29udHJvbCA9IGxpc3QobWluV29yZExlbmd0aCA9IDMpKQptID0gYXMubWF0cml4KG15RFRNKQp2ID0gc29ydChyb3dTdW1zKG0pLCBkZWNyZWFzaW5nID0gVFJVRSkKCmxpYnJhcnkod29yZGNsb3VkKQpzZXQuc2VlZCg0MzMpCndvcmRjbG91ZChuYW1lcyh2KSwgdiwgc2NhbGU9YygyLC4xKSwgbWluLmZyZXEgPSAyMCwgCiAgICAgICAgICBtYXgud29yZHMgPSA1MCwgY29sb3JzPWMoImdyZWVuIiwicmVkIiwiYmx1ZSIpLAogICAgICAgICAgcmFuZG9tLmNvbG9yID0gRkFMU0UpCmBgYAoKIyMjR2VuZXJhbCBMaW5lYWwgTW9kZWwgLSBMb2dpc3RpYyByZWdyZXNzaW9uClRoZSBtb2RlbCBpcyB0cmFpbmVkIHdpdGggaGlzdG9yaWNhbCBkYXRhLiAgV2l0aCB0aGUgbW9kZWwgcGFyYW1ldGVycyB5b3UgY2FuIG1ha2UgYSAicHJlZGljdGlvbiIgb2YgdGhlIHJlc3BvbnNlIHZhcmlhYmxlIHdpdGggbmV3IGRhdGEgKHByZWRpY3RvcnMpLgpgYGB7cn0KI3RyYWluaW5nIGRhdGEKY3VzdG9tZXIgPSByZWFkLmNzdignY3VzdG9tZXIuY3N2JywgaGVhZGVyPVRSVUUpCmN1c3RvbWVyCnN0cihjdXN0b21lcikKI2ZpdHRpbmcgdGhlIG1vZGVsCmxvZ2l0Zml0ID0gZ2xtKGJ1eSB+IHZpc2l0LnRpbWVzICsgYWdlICsgZ2VuZGVyLCBkYXRhPWN1c3RvbWVyLCBmYW1pbHk9Ymlub21pYWwobG9naXQpKQpzdW1tYXJ5KGxvZ2l0Zml0KQojcHJlZGljdGluZyBvdXRjb21lIHdpdGggbmV3IGRhdGEKQ3VzdG9tZXJJRCA8LSBjKDEsMiwzKQpnZW5kZXIgPC0gYygiRiIsIkYiLCJNIikKYWdlIDwtIGMoMzYsNTAsMjEpCnZpc2l0LnRpbWVzIDwtIGMoNSwxLDIpCmJ1eSA8LSBjKCIiLCIiLCIiKQpjdXN0b20yIDwtIGRhdGEuZnJhbWUoQ3VzdG9tZXJJRCxnZW5kZXIsYWdlLHZpc2l0LnRpbWVzLGJ1eSkKY3VzdG9tMgpwciA8LSBwcmVkaWN0KGxvZ2l0Zml0LCBjdXN0b20yLCB0eXBlPSJyZXNwb25zZSIpCnByCmBgYAoKIyMjTWFwcwpGcm9tIGEgd29ybGQgbWFwIHlvdSBjYW4gc2VsZWN0IGEgY291bnRyeSBhbmQgY2l0aWVzIHRvIGJlIHBsb3R0ZWQgaW4gYSBtYXAuCmBgYHtyfQpsaWJyYXJ5KG1hcHMpCmxpYnJhcnkobWFwZGF0YSkKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KGdncmVwZWwpCgpjaXRpZXMgPSBjKCJTYW4gSnVhbiIpCgpnbG9iYWwgPC0gbWFwX2RhdGEoIndvcmxkIikKCmdnMSA8LSBnZ3Bsb3QoKSArIGdlb21fcG9seWdvbihkYXRhID0gZ2xvYmFsLCBhZXMoeD1sb25nLCB5ID0gbGF0LCBncm91cCA9IGdyb3VwKSwgZmlsbCA9ICJncmVlbiIsIGNvbG9yID0gImJsdWUiKSArIGNvb3JkX2ZpeGVkKDEuMykKZ2cxCgpjb29ycyA8LSBkYXRhLmZyYW1lKGxvbmcgPSBjKC02Ni4xMTAwMDApLCBsYXQgPSBjKDE4LjQ1MDAwMCksIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkKCmNvb3JzJGNpdGllcyA8LSBjaXRpZXMgCmdnMSArIGdlb21fcG9pbnQoZGF0YT1jb29ycywgYWVzKGxvbmcsIGxhdCksIGNvbG91cj0icmVkIiwgc2l6ZT0xKSArCiAgZ2d0aXRsZSgiUHVlcnRvIFJpY28iKSArCiAgZ2VvbV90ZXh0X3JlcGVsKGRhdGE9Y29vcnMsIGFlcyhsb25nLCBsYXQsIGxhYmVsPWNpdGllcykpICsgeGxpbSgtNjgsLTY0LjUpICsgeWxpbSgxNy41LDE5KQpgYGAKCgo=