1. Pre-Processing

Convert the raw texts into a corpus. We can use several R packages for this: tm, quanteda, etc.

# get docs
library(tm,quietly = T)
library(quanteda,quietly = T)
## Package version: 1.5.2
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View
data(data_char_ukimmig2010)
#head(data_char_ukimmig2010)

1.1. tm package

We start with the tm package and process the raw text step by step.

library(tm,quietly = T)
cps = VCorpus(VectorSource(data_char_ukimmig2010))
cps
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 9
cps[[2]]$content
## [1] "IMMIGRATION. \n\nThe Government believes that immigration has enriched our culture and strengthened our economy, but that it must be controlled so that people have confidence in the system. We also recognise that to ensure cohesion and protect our public services, we need to introduce a cap on immigration and reduce the number of non-EU immigrants.\n\n- We will introduce an annual limit on the number of non-EU economic migrants admitted into the UK to live and work.\n\n- We will consider jointly the mechanism for implementing the limit.\n\n- We will end the detention of children for immigration purposes.\n\n- We will create a dedicated Border Police Force, as part of a refocused Serious Organised Crime Agency, to enhance national security, improve immigration controls and crack down on the trafficking of people, weapons and drugs. We will work with police forces to strengthen arrangements to deal with serious crime and other cross-boundary policing challenges, and extend collaboration between forces to deliver better value for money.\n\n- We support E-borders and will reintroduce exit checks.\n\n- We will apply transitional controls as a matter of course in the future for all new EU Member States.\n\n- We will introduce new measures to minimise abuse of the immigration system, for example via student routes, and will tackle human trafficking as a priority.\n\n- We will explore new ways to improve the current asylum system to speed up the processing of applications."
cps = tm_map(cps,scan_tokenizer) #tokenization
cps[[2]][1:10]
##  [1] "IMMIGRATION." "The"          "Government"   "believes"     "that"        
##  [6] "immigration"  "has"          "enriched"     "our"          "culture"
cps = tm_map(cps,tolower) # convert to lower case (can misbehave on some encodings; quanteda::char_tolower() is a safer alternative)
cps[[2]][1:10]
##  [1] "immigration." "the"          "government"   "believes"     "that"        
##  [6] "immigration"  "has"          "enriched"     "our"          "culture"
cps = tm_map(cps,removePunctuation) # remove punctuation
cps[[2]][1:10]
##  [1] "immigration" "the"         "government"  "believes"    "that"       
##  [6] "immigration" "has"         "enriched"    "our"         "culture"
cps = tm_map(cps,removeWords, stopwords("english")) # remove stopwords
cps[[2]][1:10]
##  [1] "immigration" ""            "government"  "believes"    ""           
##  [6] "immigration" ""            "enriched"    ""            "culture"
cps = tm_map(cps,stemDocument,language="english") #stemming
cps[[2]][1:10]
##  [1] "immigr" ""       "govern" "believ" ""       "immigr" ""       "enrich"
##  [9] ""       "cultur"
cps = tm_map(cps,removeNumbers) # remove numbers
cps[[2]][1:10]
##  [1] "immigr" ""       "govern" "believ" ""       "immigr" ""       "enrich"
##  [9] ""       "cultur"
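
The empty strings left behind by removeWords and stemDocument can be dropped explicitly. A minimal sketch in the same tm_map style as above (assuming the documents are plain character vectors at this stage):

cps_noblank = tm_map(cps, function(x) x[x != ""]) # drop empty tokens
cps_noblank[[2]][1:8]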

Unique tokens

unique(cps[[2]])[1:10]
##  [1] "immigr"     ""           "govern"     "believ"     "enrich"    
##  [6] "cultur"     "strengthen" "economi"    "must"       "control"

Question: how to remove non-English characters?
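
One simple option is to strip everything outside the printable ASCII range from the raw texts before building the corpus. A minimal sketch (the pattern below is an assumption about what counts as "non-English" and will also drop accented Latin letters):

txt_ascii = gsub("[^\x20-\x7E]", " ", data_char_ukimmig2010) # keep printable ASCII only
# alternatively: iconv(data_char_ukimmig2010, from = "UTF-8", to = "ASCII", sub = " ")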

1.2. Stanford CoreNLP (lemmatization + POS tags). Note: it can be very slow!

# Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jdk1.8.0_101')
# library(rJava)
# library(devtools)
# install_github("statsmaths/coreNLP")
# coreNLP::downloadCoreNLP()
# 
# library(coreNLP,quietly = T)
# initCoreNLP()
# output = annotateString(data_char_ukimmig2010)
# getToken(output) #lemmatization+POS+NER 
# cps = sapply(cps,jiebaR::segment,jiebaR::worker("tag"))

1.3. quanteda package

library(quanteda,quietly = T)
corp_uk = corpus(data_char_ukimmig2010)
docvars(corp_uk, "Party") <- names(data_char_ukimmig2010)
docvars(corp_uk, "Year") <- 2010
summary(corp_uk)
## Corpus consisting of 9 documents:
## 
##          Text Types Tokens Sentences        Party Year
##           BNP  1125   3280        88          BNP 2010
##     Coalition   142    260         4    Coalition 2010
##  Conservative   251    499        15 Conservative 2010
##        Greens   322    679        21       Greens 2010
##        Labour   298    683        29       Labour 2010
##        LibDem   251    483        14       LibDem 2010
##            PC    77    114         5           PC 2010
##           SNP    88    134         4          SNP 2010
##          UKIP   346    723        27         UKIP 2010
## 
## Source: D:/Dropbox/Teaching Materials/CUHK Courses/Digital Research/Slides/2020/Labs/* on x86-64 by user
## Created: Fri Feb 21 20:20:49 2020
## Notes:
summary(corpus_subset(corp_uk,Party=="BNP"))
## Corpus consisting of 1 document:
## 
##  Text Types Tokens Sentences Party Year
##   BNP  1125   3280        88   BNP 2010
## 
## Source: D:/Dropbox/Teaching Materials/CUHK Courses/Digital Research/Slides/2020/Labs/* on x86-64 by user
## Created: Fri Feb 21 20:20:49 2020
## Notes:
#texts(corp_uk)[1]
# txt -> tokens
A = tokens(data_char_ukimmig2010[1])
# corpus -> tokens
B = tokens(corpus(data_char_ukimmig2010[1]))
B[[1]][1:10]
##  [1] "IMMIGRATION"  ":"            "AN"           "UNPARALLELED" "CRISIS"      
##  [6] "WHICH"        "ONLY"         "THE"          "BNP"          "CAN"
tks = tokens_tolower(tokens(data_char_ukimmig2010))
tks = tokens_wordstem(tks)
tks = tokens(tks, remove_numbers = TRUE,  remove_punct = TRUE)
tks[[1]][1:10]
##  [1] "immigr"     "an"         "unparallel" "crisi"      "which"     
##  [6] "onli"       "the"        "bnp"        "can"        "solv"
# tks[1]
tk1 = tokens_remove(tks[1],stopwords('english'),padding = F) # padding = FALSE drops removed tokens; TRUE would keep "" placeholders
tk1[[1]][1:10]
##  [1] "immigr"     "unparallel" "crisi"      "onli"       "bnp"       
##  [6] "can"        "solv"       "current"    "immigr"     "birth"

2. Vector Space Model

2.1. Document-Term Matrix

cps = tm_map(cps,PlainTextDocument)
dtm = DocumentTermMatrix(cps,control=list(weighting=weightTfIdf,
                                          #stemming = TRUE,
                                          stopwords = TRUE, 
                                          minWordLength = 3,
                                          tolower=FALSE,
                                          removePunctuation=T,
                                          removeNumbers=T)) 
dim(dtm)
## [1]    9 1167
colnames(dtm)[1:10]
##  [1] "abandon" "abid"    "abil"    "abl"     "abolish" "abolit"  "abroad" 
##  [8] "abscond" "absenc"  "absolut"
dtm=dtm[,c(3:10)] # keep only a few terms, purely as an illustration

Or, equivalently, build the document-feature matrix with quanteda:

myStemMat <- dfm(corp_uk, 
                 remove = stopwords("english"), 
                 stem = TRUE, 
                 remove_numbers=TRUE,
                 remove_punct = TRUE)
dim(myStemMat)
## [1]    9 1174
dim(dfm_remove(myStemMat,min_nchar=3))
## [1]    9 1168
colnames(myStemMat)[1:20]
##  [1] "immigr"     "unparallel" "crisi"      "bnp"        "can"       
##  [6] "solv"       "current"    "birth"      "rate"       "indigen"   
## [11] "british"    "peopl"      "set"        "becom"      "minor"     
## [16] "well"       "within"     "year"       "result"     "extinct"

2.2. Keyword Selection

The most discriminating words for the Labour manifesto (the target) versus all other manifestos (the reference):

library(quanteda)
# summary(cps2)
word_list = textstat_keyness(myStemMat,target=docvars(corp_uk,'Party')=="Labour",measure="chi2")
rbind(head(word_list,10),tail(word_list,10))
##              feature       chi2            p n_target n_reference
## 1               wage 26.4820887 2.659930e-07        4           0
## 2               rise 25.9264920 3.546682e-07        6           3
## 3         points-bas 22.7384456 1.856168e-06        5           2
## 4             growth 19.8348034 8.443129e-06        4           1
## 5              level 18.8586655 1.407701e-05        5           3
## 6    australian-styl 17.7759366 2.485064e-05        3           0
## 7                see 17.7759366 2.485064e-05        3           0
## 8             public 13.4929507 2.394613e-04        5           5
## 9             servic 12.3324698 4.451471e-04        4           3
## 10             local 10.0294114 1.540603e-03        4           4
## 1165          seeker -0.4646357 4.954654e-01        0          12
## 1166            must -0.4646357 4.954654e-01        0          12
## 1167          number -0.5595638 4.544361e-01        0          13
## 1168          polici -0.5595638 4.544361e-01        0          13
## 1169          deport -0.6570848 4.175910e-01        0          14
## 1170             bnp -0.9607494 3.269981e-01        0          17
## 1171           shall -0.9607494 3.269981e-01        0          17
## 1172           popul -1.4892780 2.223285e-01        0          22
## 1173         countri -2.0331582 1.539005e-01        0          27
## 1174              uk -2.1431333 1.432091e-01        0          28
# Plot estimated word keyness
textplot_keyness(word_list) 

Try another measure (the likelihood ratio) and put the whole pipeline together:

word_list = textstat_keyness(dfm(corpus(data_char_ukimmig2010),remove_punct = TRUE,
                                 stem = T,remove = stopwords('english')),
                             target=docvars(corp_uk,'Party')=="Conservative",
                             measure="lr")
rbind(head(word_list,10),tail(word_list,10))
##         feature        G2            p n_target n_reference
## 1       student 24.880013 6.101170e-07        8           4
## 2          want  9.466485 2.092599e-03        4           3
## 3      institut  6.753476 9.356523e-03        3           2
## 4       serious  6.753476 9.356523e-03        3           2
## 5       attract  6.753476 9.356523e-03        3           2
## 6     brightest  5.812517 1.591250e-02        2           0
## 7          best  5.812517 1.591250e-02        2           0
## 8        colleg  5.812517 1.591250e-02        2           0
## 9          come  5.678286 1.717614e-02        3           3
## 10         forc  4.178668 4.093586e-02        3           5
## 1223      resid -1.015888 3.134963e-01        0           7
## 1224     servic -1.015888 3.134963e-01        0           7
## 1225       home -1.015888 3.134963e-01        0           7
## 1226    benefit -1.015888 3.134963e-01        0           7
## 1227     detent -1.015888 3.134963e-01        0           7
## 1228       fair -1.015888 3.134963e-01        0           7
## 1229 points-bas -1.015888 3.134963e-01        0           7
## 1230     asylum -1.750795 1.857768e-01        0          29
## 1231     immigr -2.084076 1.488423e-01        3          84
## 1232    britain -2.437630 1.184550e-01        0          35

2.3. Distance & Similarity

dim(myStemMat)
## [1]    9 1174
Simil = textstat_simil(myStemMat, margin = "documents", method = "cosine")
m=as.matrix(Simil)
m[1:5,1:5]
##                    BNP Coalition Conservative    Greens    Labour
## BNP          1.0000000 0.3559692    0.3044682 0.4363564 0.4143731
## Coalition    0.3559692 1.0000000    0.5384642 0.3967000 0.4512907
## Conservative 0.3044682 0.5384642    1.0000000 0.3193178 0.3869766
## Greens       0.4363564 0.3967000    0.3193178 1.0000000 0.4011530
## Labour       0.4143731 0.4512907    0.3869766 0.4011530 1.0000000

Similarity using the tf-idf matrix:

Simil2 = textstat_simil(dfm_tfidf(myStemMat), margin = "documents", method = "cosine")
m2=as.matrix(Simil2)
m2[1:5,1:5]
##                     BNP  Coalition Conservative     Greens     Labour
## BNP          1.00000000 0.05819361   0.07696592 0.13400324 0.09599230
## Coalition    0.05819361 1.00000000   0.34525904 0.07695841 0.07184202
## Conservative 0.07696592 0.34525904   1.00000000 0.08921872 0.14580452
## Greens       0.13400324 0.07695841   0.08921872 1.00000000 0.09339164
## Labour       0.09599230 0.07184202   0.14580452 0.09339164 1.00000000

Now, use the tf-idf matrix to calculate the similarity scores by hand:

rm = dfm_tfidf(myStemMat)
rm = convert(rm,to="matrix")
rm[1:5,1:5]
##               features
## docs           immigr unparallel    crisi      bnp       can
##   BNP               0  0.9542425 1.908485 16.22212 0.1091445
##   Coalition         0  0.0000000 0.000000  0.00000 0.0000000
##   Conservative      0  0.0000000 0.000000  0.00000 0.2182889
##   Greens            0  0.0000000 0.000000  0.00000 0.1091445
##   Labour            0  0.0000000 0.000000  0.00000 0.1091445
library(lsa,quietly = T)
cosine(rm[3,],rm[4,])
##            [,1]
## [1,] 0.08921872
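
The same value can be reproduced directly from the definition of cosine similarity, without lsa:

x = rm[3,]; y = rm[4,]
sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2))) # dot product divided by the product of the vector norms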

Without weighting:

rm = convert(myStemMat,to="matrix")
cosine(rm[3,],rm[4,])
##           [,1]
## [1,] 0.3193178

NOTE: Please use the tf-idf matrix to calculate cosine similarity!

#distance
d1 = as.matrix(textstat_dist(myStemMat,margin = 'documents'))
d1[1:5,1:5]
##                   BNP Coalition Conservative   Greens   Labour
## BNP           0.00000  89.53770     89.58795 85.65045 85.71464
## Coalition    89.53770   0.00000     18.68154 24.55606 27.71281
## Conservative 89.58795  18.68154      0.00000 28.28427 30.24897
## Greens       85.65045  24.55606     28.28427  0.00000 31.60696
## Labour       85.71464  27.71281     30.24897 31.60696  0.00000
d2 = as.matrix(textstat_dist(dfm_tfidf(myStemMat),margin = 'documents')) # distance on the tf-idf-weighted matrix
d2[1:5,1:5]
##                       BNP Coalition Conservative       Greens   Labour
## BNP          1.279488e-05 57.923303    58.066400 5.774702e+01 58.33971
## Coalition    5.792330e+01  0.000000     9.684996 1.438716e+01 14.89768
## Conservative 5.806640e+01  9.684996     0.000000 1.617972e+01 16.10513
## Greens       5.774702e+01 14.387165    16.179721 2.384190e-07 18.70952
## Labour       5.833971e+01 14.897679    16.105127 1.870952e+01  0.00000
d3 = as.matrix(textstat_dist(dfm_weight(myStemMat,"prop"),margin = 'documents')) # distance on row-normalized (proportion) weights
d3[1:5,1:5]
##                     BNP Coalition Conservative     Greens       Labour
## BNP          0.00000000 0.1104332 9.128099e-02 0.07761299 8.578865e-02
## Coalition    0.11043323 0.0000000 1.021237e-01 0.11318076 1.112240e-01
## Conservative 0.09128099 0.1021237 5.268356e-09 0.10097129 1.005994e-01
## Greens       0.07761299 0.1131808 1.009713e-01 0.00000000 9.558398e-02
## Labour       0.08578865 0.1112240 1.005994e-01 0.09558398 7.450581e-09

2.4. KWIC (Key Words In Context) & Collocations

head(kwic(corpus(data_char_ukimmig2010), "immigration", window = 3, valuetype = "fixed"))
##                                                                         
##    [BNP, 1]                        | IMMIGRATION | : AN UNPARALLELED    
##   [BNP, 16]           - At current | immigration | and birth rates      
##   [BNP, 78]         to all further | immigration | , the deportation    
##  [BNP, 169]    regardless of their | immigration | status.-             
##  [BNP, 197] they orchestrated mass | immigration | to change forcibly   
##  [BNP, 272]        , threatened by | immigration | and multiculturalism.
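
kwic also accepts multi-word patterns via phrase(); a quick sketch:

head(kwic(corpus(data_char_ukimmig2010), phrase("asylum seekers"), window = 3))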

Collocation:

toks_c <- tokens(char_tolower(data_char_ukimmig2010), remove_numbers=TRUE, remove_punct=TRUE)
toks_c <- tokens_remove(toks_c, stopwords("english"))
colls <- textstat_collocations(toks_c, size = 2)
head(colls[order(colls$count, decreasing=TRUE),], n=10)
##             collocation count count_nested length   lambda        z
## 123      asylum seekers    10            0      2 8.183483 5.585220
## 2          human rights     7            0      2 8.297095 8.586934
## 8        british people     7            0      2 3.345763 7.453235
## 145 points-based system     7            0      2 7.797602 5.278280
## 1    illegal immigrants     6            0      2 5.525430 9.029779
## 3   national statistics     5            0      2 6.313671 8.340047
## 4       office national     5            0      2 6.313671 8.340047
## 17        shall abolish     5            0      2 6.886232 7.145361
## 50  illegal immigration     5            0      2 3.609867 6.409900
## 158  immigration system     5            0      2 2.563670 5.202498
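
The collocations can also be ranked by the strength of association (the z statistic) rather than by raw frequency:

head(colls[order(colls$z, decreasing=TRUE),], n=10)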

2.5. Diversity and Complexity

# readability
fk <- textstat_readability(corp_uk, "Flesch.Kincaid")
fk
##       document Flesch.Kincaid
## 1          BNP       18.17927
## 2    Coalition       28.01016
## 3 Conservative       15.29153
## 4       Greens       16.35716
## 5       Labour       12.26813
## 6       LibDem       17.15433
## 7           PC       14.55468
## 8          SNP       15.66630
## 9         UKIP       14.02895
# lexical diversity (type-token ratio)
ld <- textstat_lexdiv(myStemMat, "TTR")
ld
##       document       TTR
## 1          BNP 0.4726027
## 2    Coalition 0.7829457
## 3 Conservative 0.7024793
## 4       Greens 0.7044025
## 5       Labour 0.6106195
## 6       LibDem 0.6975806
## 7           PC 0.8644068
## 8          SNP 0.8208955
## 9         UKIP 0.6225352
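
TTR is simply the number of types divided by the number of tokens, so the first row can be checked by hand from the dfm:

ntype(myStemMat)[1] / ntoken(myStemMat)[1] # should match the BNP TTR above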

2.6. Hypothesis Testing (Bootstrapping)

library(boot)

doc = data_char_ukimmig2010[1]
tks = tokens(doc,what='sentence')
data = tks[[1]]

foo <- function(data, indices){
  dt<-paste(data[indices],collapse = " ")
  corp_uk = corpus(dt)
  fk <- textstat_readability(corp_uk, "Flesch.Kincaid")
  fk$Flesch.Kincaid
}

#set.seed(12345)
myBootstrap <- boot(data, foo, R=100)
plot(myBootstrap)

boot.ci(myBootstrap)
## Warning in boot.ci(myBootstrap): bootstrap variances needed for studentized
## intervals
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 100 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = myBootstrap)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (15.16, 20.69 )   (14.27, 20.11 )  
## 
## Level     Percentile            BCa          
## 95%   (16.25, 22.09 )   (16.27, 22.10 )  
## Calculations and Intervals on Original Scale
## Some basic intervals may be unstable
## Some percentile intervals may be unstable
## Some BCa intervals may be unstable

Let’s test the significance of the difference in readability between the Conservative (document 3) and Greens (document 4) manifestos.

docA = data_char_ukimmig2010[3]
tksA = tokens(docA,what='sentence')
dataA = tksA[[1]]
dataA = data.frame("text"=dataA,"party"="Conservative",stringsAsFactors = F)

docB = data_char_ukimmig2010[4]
tksB = tokens(docB,what='sentence')
dataB = tksB[[1]]
dataB = data.frame("text"=dataB,"party"="Green",stringsAsFactors = F)

data = rbind(dataA,dataB)

foo <- function(data,indices){
  sd = data[indices,]
  
  dataA = sd[sd$party=="Conservative",]
  dtA<-paste(dataA$text,collapse = " ")
  corp_A = corpus(dtA)
  fkA <- textstat_readability(corp_A, "Flesch.Kincaid")
  
  dataB = sd[sd$party=="Green",]
  dtB<-paste(dataB$text,collapse = " ")
  corp_B = corpus(dtB)
  fkB <- textstat_readability(corp_B, "Flesch.Kincaid")
  
  fkA$Flesch.Kincaid - fkB$Flesch.Kincaid
}

#set.seed(12345)
myBootstrap <- boot(data, foo, R=100)
plot(myBootstrap)

boot.ci(myBootstrap)
## Warning in boot.ci(myBootstrap): bootstrap variances needed for studentized
## intervals
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 100 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = myBootstrap)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (-5.972,  4.569 )   (-6.000,  5.660 )  
## 
## Level     Percentile            BCa          
## 95%   (-7.792,  3.869 )   (-5.697,  4.347 )  
## Calculations and Intervals on Original Scale
## Some basic intervals may be unstable
## Some percentile intervals may be unstable
## Some BCa intervals may be unstable
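
As a rough complement to the intervals above, a two-sided bootstrap p-value can be approximated from the share of replicates falling on either side of zero (an informal sketch, not part of boot.ci):

2 * min(mean(myBootstrap$t <= 0), mean(myBootstrap$t >= 0)) # crude two-sided p-value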

3. Unsupervised Learning

#devtools::install_github("quanteda/quanteda.corpora")
library(quanteda)
library(quanteda.corpora)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(topicmodels)
corp_news <- download('data_corpus_guardian')
corp_news <- corpus_subset(corp_news, date >= as.Date("2016-10-01"))
ndoc(corp_news)
## [1] 241

3.1. Clustering

We can identify documents that are similar to one another based on their word frequencies. There are different metrics for computing similarity or distance between documents (e.g. Jaccard, cosine, Euclidean). Below we compute Euclidean distances between proportion-weighted document vectors and then cluster the documents hierarchically.

myStemMat <- dfm(corp_news, 
                 remove = stopwords("english"), 
                 stem = TRUE, 
                 remove_numbers=TRUE,
                 remove_punct = TRUE)
dim(myStemMat)
## [1]   241 11307
dim(dfm_remove(myStemMat,min_nchar=3))
## [1]   241 11155
# compute distances
distances <- textstat_dist(dfm_weight(myStemMat,"prop"),margin = 'documents') # normalized (proportion-weighted) distance
as.matrix(distances)[1:5, 1:5]
##            text184646 text181782 text182271 text180150 text181097
## text184646  0.0000000  0.1267562  0.1517591  0.1302946  0.1734782
## text181782  0.1267562  0.0000000  0.1331665  0.1125530  0.1581297
## text182271  0.1517591  0.1331665  0.0000000  0.1360549  0.1801829
## text180150  0.1302946  0.1125530  0.1360549  0.0000000  0.1634364
## text181097  0.1734782  0.1581297  0.1801829  0.1634364  0.0000000
# clustering
cluster <- hclust(as.dist(distances))
memb <- cutree(cluster, k = 10)
table(memb,docvars(myStemMat)$section)
## < table of extent 10 x 0 >
plot(cluster)

# draw a dendrogram with red borders around the clusters, e.g.
# rect.hclust(cluster, k=10, border="red")
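
For comparison, the clustering can also be run on a similarity-based distance, e.g. one minus cosine (or Jaccard) similarity; a sketch:

sim <- textstat_simil(dfm_weight(myStemMat,"prop"), margin = "documents", method = "cosine")
cluster2 <- hclust(as.dist(1 - as.matrix(sim))) # convert similarity into a distance
# plot(cluster2)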

3.2. Dimensionality Reduction (PCA+CA)

A related unsupervised technique is dimensionality reduction, for example principal component analysis (PCA). PCA identifies a set of uncorrelated components that capture as much of the variance in the document-term matrix as possible. The first component always captures the largest share of the variance, the second the second largest, and so on. Comparing the share of variance captured by the first component with the rest tells us to what extent the data can be reduced to a single dimension.

# Principal components analysis
pca <- prcomp(t(as.matrix(dfm_tfidf(myStemMat)))) 
plot(pca) # first PC captures most of the variance
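# A quick check on the comment above (a minimal sketch): the share of variance
# explained by each component, computed from the prcomp() output.
pve <- pca$sdev^2 / sum(pca$sdev^2)
round(pve[1:5], 3)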

# plot first principal component
plot(pca$rotation[,1], pca$rotation[,2], type="n")
text(pca$rotation[,1], pca$rotation[,2], labels=docnames(myStemMat))

# looking at features for each PC
df <- data.frame(
  featname = featnames(myStemMat),
  dim1 = pca$x[,1],
  dim2 = pca$x[,2],
  stringsAsFactors=FALSE
)

head(df[order(df$dim1),])
##          featname       dim1          dim2
## trump       trump -0.6771307 -1.2294342047
## landlord landlord -0.6721003 -0.1240832287
## oyster     oyster -0.6235862 -0.2852365111
## clinton   clinton -0.6024442 -0.5931029013
## tax           tax -0.5982343 -0.0792776523
## mortgag   mortgag -0.5374781  0.0004140003

A similar dimensionality reduction technique is correspondence analysis.

out <- textmodel_ca(dfm_tfidf(myStemMat))

# documents
df <- data.frame(
  docname = docnames(dfm_tfidf(myStemMat)),
  dim1 = out$rowcoord[,1],
  dim2 = out$rowcoord[,2],
  stringsAsFactors=FALSE
)

head(df[order(df$dim1),])
##               docname       dim1       dim2
## text184558 text184558 -0.1900521 -0.7418218
## text183518 text183518 -0.1899572 -0.3978320
## text183771 text183771 -0.1896604  0.7165693
## text187736 text187736 -0.1891202 -0.4426383
## text183060 text183060 -0.1887915  0.2083412
## text185581 text185581 -0.1886220  0.3889564
plot(df$dim1, df$dim2, type="n")
text(df$dim1, df$dim2, labels=df$docname)

# features
df <- data.frame(
  featname = featnames(myStemMat),
  dim1 = out$colcoord[,1],
  dim2 = out$colcoord[,2],
  stringsAsFactors=FALSE
)

head(df[order(df$dim1),])
##                  featname       dim1     dim2
## 420mg               420mg -0.1966584 -1.02433
## vial                 vial -0.1966584 -1.02433
## vat                   vat -0.1966584 -1.02433
## herceptin       herceptin -0.1966584 -1.02433
## docetaxel       docetaxel -0.1966584 -1.02433
## chemotherapi chemotherapi -0.1966584 -1.02433

3.3. LDA

The general procedure:

library(topicmodels)
quantdfm <- dfm(corp_news, 
                remove_punct = TRUE, remove_numbers = TRUE, remove = stopwords("english"))
quantdfm <- dfm_trim(quantdfm, min_termfreq = 4, max_docfreq = 10)
quantdfm
## Document-feature matrix of: 241 documents, 3,495 features (98.0% sparse).
K=10
mtx = convert(quantdfm, to = "topicmodels")
lda <- LDA(mtx, k = K, method = "Gibbs", 
                control = list(verbose=25L, seed = 123, burnin = 100, iter = 500,keep=50))
## K = 10; V = 3495; M = 241
## Sampling 600 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Iteration 525 ...
## Iteration 550 ...
## Iteration 575 ...
## Iteration 600 ...
## Gibbs sampling completed!
get_terms(lda, 10)
##       Topic 1     Topic 2     Topic 3      Topic 4     Topic 5     
##  [1,] "corbyn"    "landlords" "plastic"    "muslim"    "castro"    
##  [2,] "abuse"     "income"    "indigenous" "muslims"   "alcohol"   
##  [3,] "taylor"    "fbi"       "coal"       "mining"    "cuba"      
##  [4,] "miss"      "fees"      "refugees"   "bridge"    "gas"       
##  [5,] "lady"      "rent"      "berlin"     "census"    "island"    
##  [6,] "heathrow"  "emails"    "emissions"  "fake"      "bars"      
##  [7,] "lord"      "buildings" "collect"    "sex"       "latin"     
##  [8,] "expansion" "weiner"    "urban"      "moving"    "castro's"  
##  [9,] "supreme"   "wealth"    "waste"      "terrorist" "african"   
## [10,] "select"    "mortgage"  "injured"    "travel"    "revolution"
##       Topic 6       Topic 7        Topic 8   Topic 9      Topic 10    
##  [1,] "ukip"        "nhs"          "de"      "syrian"     "google"    
##  [2,] "immigrants"  "labor"        "la"      "taliban"    "ireland"   
##  [3,] "fire"        "turnbull"     "el"      "aid"        "windows"   
##  [4,] "trafficking" "pension"      "en"      "resolution" "digital"   
##  [5,] "county"      "requirements" "los"     "arms"       "app"       
##  [6,] "protesters"  "visa"         "n"       "pakistan"   "nice"      
##  [7,] "warehouse"   "visas"        "que"     "aleppo"     "breast"    
##  [8,] "rosser"      "canada"       "y"       "peace"      "patients"  
##  [9,] "beijing"     "percentile"   "oysters" "girls"      "devices"   
## [10,] "resort"      "granted"      "oyster"  "girl"       "healthcare"
topics <- get_topics(lda, 1)
head(topics)
## text184646 text181782 text182271 text180150 text181097 text183267 
##          6          8          1          4          4          4
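
Beyond the single most likely topic per document, the full document-topic distribution is available via posterior():

theta <- posterior(lda)$topics # documents x topics probability matrix; rows sum to 1
round(theta[1:3, 1:5], 2)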

Two indicators of model fit: perplexity and log-likelihood.

pp = perplexity(lda,convert(quantdfm, to = "topicmodels"))
## K = 10; V = 3495; M = 241
## Sampling 600 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Iteration 525 ...
## Iteration 550 ...
## Iteration 575 ...
## Iteration 600 ...
## Gibbs sampling completed!
## K = 10; V = 3495; M = 241
## Sampling 100 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Gibbs sampling completed!
ll = logLik(lda)
print(pp)
## [1] 1503.15
print(ll)
## 'log Lik.' -187700 (df=34950)
lda@logLiks[-c(1:(100/50))]
##  [1] -187866.2 -187881.2 -187887.0 -187493.0 -187551.1 -187788.5 -187271.3
##  [8] -187648.1 -187829.5 -187700.0
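
The stored log-likelihoods can also be plotted as a trace to eyeball convergence of the Gibbs chain:

plot(lda@logLiks, type = "l", xlab = "stored iteration (every keep = 50 draws)", ylab = "log-likelihood")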

Select K based on perplexity and log-likelihood, via the harmonic mean of the stored log-likelihoods:

burnin = 1000
iter = 1000
keep = 50
# generate numerous topic models with different numbers of topics
sequ <- seq(0, 20, 5)[2:5] # candidate numbers of topics: 5, 10, 15, 20
fitted_many <- lapply(sequ, function(k) LDA(mtx, k = k, method = "Gibbs",
                                            control = list(burnin = burnin, iter = iter, keep = keep)))
#save(fitted_many,file='TopicModels_f.Rdata')

Extract the log-likelihoods from each fitted model (discarding the burn-in draws):

logLiks_many <- lapply(fitted_many, function(L)  L@logLiks[-c(1:(burnin/keep))])
# compute harmonic means
# install.packages('psych')
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:boot':
## 
##     logit
hm_many <- sapply(logLiks_many, function(h) harmonic.mean(h))
# inspect
plot(sequ, hm_many, type = "l")

# the number of topics with the highest harmonic-mean log-likelihood
sequ[which.max(hm_many)]
## [1] 20
final <- fitted_many[[3]] # here the 15-topic model (sequ[3])
get_terms(final,10)
##       Topic 1     Topic 2         Topic 3 Topic 4     Topic 5      Topic 6  
##  [1,] "buildings" "requirements"  "de"    "urban"     "indigenous" "google" 
##  [2,] "bridge"    "heathrow"      "la"    "berlin"    "girls"      "ireland"
##  [3,] "rail"      "resort"        "el"    "sex"       "girl"       "digital"
##  [4,] "select"    "percentile"    "en"    "rosser"    "taylor"     "windows"
##  [5,] "sleep"     "expansion"     "los"   "injured"   "lady"       "app"    
##  [6,] "ticket"    "aircraft"      "n"     "german"    "supreme"    "devices"
##  [7,] "moving"    "airport"       "que"   "christian" "aboriginal" "samsung"
##  [8,] "websites"  "self-employed" "y"     "crowd"     "vaccine"    "names"  
##  [9,] "sites"     "entry"         "beach" "driver"    "gender"     "fake"   
## [10,] "khan"      "club"          "las"   "abrini"    "strait"     "tech"   
##       Topic 7      Topic 8      Topic 9        Topic 10      Topic 11  
##  [1,] "muslim"     "pension"    "alcohol"      "plastic"     "castro"  
##  [2,] "muslims"    "mining"     "corbyn"       "gas"         "fbi"     
##  [3,] "immigrants" "fire"       "bars"         "trafficking" "cuba"    
##  [4,] "syrian"     "county"     "lewis"        "collect"     "weiner"  
##  [5,] "aid"        "protesters" "productivity" "oysters"     "island"  
##  [6,] "arms"       "uber"       "dry"          "waste"       "emails"  
##  [7,] "refugees"   "cars"       "unite"        "oyster"      "latin"   
##  [8,] "aleppo"     "warehouse"  "sectors"      "shale"       "castro's"
##  [9,] "census"     "vehicles"   "bar"          "clean"       "sanders" 
## [10,] "peace"      "dunlop"     "travel"       "bags"        "podesta" 
##       Topic 12   Topic 13       Topic 14    Topic 15    
##  [1,] "ukip"     "nhs"          "labor"     "landlords" 
##  [2,] "taliban"  "nice"         "turnbull"  "income"    
##  [3,] "pakistan" "breast"       "coal"      "fees"      
##  [4,] "beijing"  "patients"     "visa"      "rent"      
##  [5,] "king"     "winter"       "emissions" "mortgage"  
##  [6,] "abuse"    "species"      "visas"     "wealth"    
##  [7,] "rogers"   "stein"        "poverty"   "resolution"
##  [8,] "hong"     "independence" "canada"    "credit"    
##  [9,] "farage"   "voting"       "mexico"    "hammond"   
## [10,] "woolfe"   "wisconsin"    "shorten"   "rental"

Select K using 5-fold cross-validation over different numbers of topics:

library(cvTools)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:psych':
## 
##     cushny
## The following object is masked from 'package:boot':
## 
##     salinity
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# dtm is the doc-term matrix (convert first)
cvLDA <- function(Ntopics,dtm,K=5) {
  folds<-cvFolds(nrow(dtm),K,1)
  perplex <- rep(NA,K)
  llk <- rep(NA,K)
  for(i in unique(folds$which)){
    cat(i, " ")
    which.test <- folds$subsets[folds$which==i]
    which.train <- {1:nrow(dtm)}[-which.test]
    dtm.train <- dtm[which.train,]
    dtm.test <- dtm[which.test,]
    lda.fit <- LDA(dtm.train, k=Ntopics, method="Gibbs",control=list(verbose=50L, iter=100))
    perplex[i] <- perplexity(lda.fit, dtm.test)
    llk[i] <- logLik(lda.fit)
  }
  res = data.frame(n=rep(Ntopics,K),perplexity=perplex,logLik=llk)
  return(res)
}

# now test:
N <- c(20, 30, 40, 50, 60, 70)
results <- data.frame()
for (n in N){
    cat("\n\n\n##########\n ", n, "topics", "\n")
    res <- cvLDA(n, mtx)
    results <- bind_rows(results,res)
}
## 
## 
## 
## ##########
##   20 topics 
## 1  K = 20; V = 3495; M = 192
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 3495; M = 49
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## (... analogous Gibbs sampling progress repeated for folds 2-5, and again for 30, 40, 50, 60 and 70 topics ...)
results$ratio_perp1 <- results$perplexity / max(results$perplexity)
results$ratio_lk1 <- results$logLik / min(results$logLik)

df <- results %>% group_by(n) %>%
  summarise(ratio_perp=mean(ratio_perp1),sd_perp=sd(ratio_perp1),
            ratio_lk=mean(ratio_lk1),sd_lk=sd(ratio_lk1))

df
## # A tibble: 6 x 5
##       n ratio_perp sd_perp ratio_lk   sd_lk
##   <dbl>      <dbl>   <dbl>    <dbl>   <dbl>
## 1    20      0.958  0.0319    0.978 0.00665
## 2    30      0.882  0.0260    0.975 0.0117 
## 3    40      0.836  0.0276    0.974 0.0192 
## 4    50      0.797  0.0216    0.975 0.0231 
## 5    60      0.776  0.0171    0.977 0.0102 
## 6    70      0.755  0.0299    0.980 0.0141
pd = df[,1:3]
colnames(pd)=c('n','value',"sd")
pd$variable = "Perplexity"
df$variable = "LogLikelihood"
colnames(df)[4:5]=c('value',"sd")
pd = bind_rows(pd,df[,c(1,4:6)])
pd
## # A tibble: 12 x 4
##        n value      sd variable     
##    <dbl> <dbl>   <dbl> <chr>        
##  1    20 0.958 0.0319  Perplexity   
##  2    30 0.882 0.0260  Perplexity   
##  3    40 0.836 0.0276  Perplexity   
##  4    50 0.797 0.0216  Perplexity   
##  5    60 0.776 0.0171  Perplexity   
##  6    70 0.755 0.0299  Perplexity   
##  7    20 0.978 0.00665 LogLikelihood
##  8    30 0.975 0.0117  LogLikelihood
##  9    40 0.974 0.0192  LogLikelihood
## 10    50 0.975 0.0231  LogLikelihood
## 11    60 0.977 0.0102  LogLikelihood
## 12    70 0.980 0.0141  LogLikelihood
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## The following object is masked from 'package:NLP':
## 
##     annotate
library(grid)

p <- ggplot(pd, aes(x=n, y=value, linetype=variable))
pq <- p + geom_line() + geom_point(aes(shape=variable), 
        fill="white", shape=21, size=1.40) +
    geom_errorbar(aes(ymax=value+sd, ymin=value-sd), width=4) +
    scale_y_continuous("Ratio wrt worst value") +
    scale_x_continuous("Number of topics", 
        breaks=N) +  # break at the candidate topic numbers (K above refers to the earlier 10-topic fit)
    theme_bw() 
pq