Convert the texts to a corpus. Several R packages can do this (tm, quanteda, etc.); here we introduce quanteda, which is more convenient for our tasks:
# load library and data
library(quanteda,quietly = T)
library(quanteda.textstats) # keyness readability etc.
library(quanteda.textplots)
library(quanteda.textmodels) # pca/ca/lda
library(lubridate) # time processing
To convert the text documents to a corpus, use quanteda::corpus():
# devtools::install_github("quanteda/quanteda.corpora")
# library(quanteda.corpora)
# corp_news <- download('data_corpus_guardian')
load("docs.Rdata")
docs$year = year(docs$date)
# texts -> corpus
corp_news = corpus(docs$texts) # the input could be a data.frame (set text_field)
# set the document-level variables pub & year:
docvars(corp_news, "pub") <- docs$pub
docvars(corp_news, "year") <- docs$year
summary(corp_news[1:10])
## Corpus consisting of 10 documents, showing 10 documents:
##
## Text Types Tokens Sentences pub year
## text1 51 68 4 The Guardian 2016
## text2 413 1052 41 The Guardian 2015
## text3 49 83 1 The Guardian 2014
## text4 348 866 27 The Guardian 2015
## text5 986 2768 75 The Guardian 2016
## text6 522 1271 43 The Guardian 2014
## text7 551 1505 56 The Guardian 2015
## text8 245 544 23 The Guardian - Final Edition 2013
## text9 158 260 6 The Guardian - Final Edition 2013
## text10 424 1030 37 The Guardian 2016
summary(corpus_subset(corp_news,year==2016)[1:10])
## Corpus consisting of 10 documents, showing 10 documents:
##
## Text Types Tokens Sentences pub year
## text1 51 68 4 The Guardian 2016
## text5 986 2768 75 The Guardian 2016
## text10 424 1030 37 The Guardian 2016
## text11 408 1111 45 The Guardian 2016
## text13 556 1261 39 The Guardian 2016
## text15 518 1384 53 The Guardian 2016
## text17 513 1128 33 The Guardian 2016
## text19 324 683 19 The Guardian 2016
## text25 300 734 21 The Guardian 2016
## text26 232 407 16 The Guardian 2016
corp_news = corpus(docs[,c("texts","pub","year")],text_field = "texts")
summary(corp_news[1:10])
## Corpus consisting of 10 documents, showing 10 documents:
##
## Text Types Tokens Sentences pub year
## text136751 51 68 4 The Guardian 2016
## text118588 413 1052 41 The Guardian 2015
## text45146 49 83 1 The Guardian 2014
## text93623 348 866 27 The Guardian 2015
## text136585 986 2768 75 The Guardian 2016
## text65682 522 1271 43 The Guardian 2014
## text107174 551 1505 56 The Guardian 2015
## text22792 245 544 23 The Guardian - Final Edition 2013
## text32425 158 260 6 The Guardian - Final Edition 2013
## text139163 424 1030 37 The Guardian 2016
docvars(corp_news,"pub")[1:10]
## [1] "The Guardian" "The Guardian"
## [3] "The Guardian" "The Guardian"
## [5] "The Guardian" "The Guardian"
## [7] "The Guardian" "The Guardian - Final Edition"
## [9] "The Guardian - Final Edition" "The Guardian"
To check the first document:
as.character(corp_news[1])
## text136751
## "London masterclass on climate change | Do you want to understand more about climate change? On 14 March the Guardian's assistant national news editor James Randerson is giving a masterclass on the topic in London. He'll introduce the science and talk about predictions for what impact the changing climate will have - and discuss what we can do about it. And finally..."
To convert documents to tokens:
# txt -> tokens
A = quanteda::tokens(docs$texts[1])
A[[1]][1:20]
## [1] "London" "masterclass" "on" "climate" "change"
## [6] "|" "Do" "you" "want" "to"
## [11] "understand" "more" "about" "climate" "change"
## [16] "?" "On" "14" "March" "the"
# corpus -> tokens
B = quanteda::tokens(corp_news[1])
B[[1]][1:20]
## [1] "London" "masterclass" "on" "climate" "change"
## [6] "|" "Do" "you" "want" "to"
## [11] "understand" "more" "about" "climate" "change"
## [16] "?" "On" "14" "March" "the"
Preprocess the tokens:
# convert to lower case
tks_lower = tokens_tolower(quanteda::tokens(corp_news))
# stemming
tks_stem = tokens_wordstem(tks_lower)
# or lemmatization [textstem::lemmatize_strings]
tks_lemma = tokens_replace(tks_lower, pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma)
Compare the differences between tks_stem and tks_lemma:
tks_stem[1]
## Tokens consisting of 1 document and 2 docvars.
## text136751 :
## [1] "london" "masterclass" "on" "climat" "chang"
## [6] "|" "do" "you" "want" "to"
## [11] "understand" "more"
## [ ... and 56 more ]
tks_lemma[1]
## Tokens consisting of 1 document and 2 docvars.
## text136751 :
## [1] "london" "masterclass" "on" "climate" "change"
## [6] "|" "do" "you" "want" "to"
## [11] "understand" "much"
## [ ... and 56 more ]
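As an aside, the textstem package referenced in the comment above can also lemmatize raw character vectors directly; a minimal sketch (not run here):
# textstem::lemmatize_strings(docs$texts[1])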
# remove numbers and punctuation
tks = quanteda::tokens(tks_lemma, remove_numbers = TRUE, remove_punct = TRUE)
tks[[1]][1:10]
## [1] "london" "masterclass" "on" "climate" "change"
## [6] "|" "do" "you" "want" "to"
# tks[1]
tks = tokens_remove(tks,stopwords('english'),padding = F) # padding = F drops removed positions instead of leaving "" placeholders
tks[[1]][1:10]
## [1] "london" "masterclass" "climate" "change" "|"
## [6] "want" "understand" "much" "climate" "change"
Question: how to remove non-English characters?
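One simple option is to keep only tokens made up of Latin letters, using a regular expression with tokens_select(); a sketch (the tks_en name is just illustrative, and what counts as "non-English" depends on the application):
# keep only tokens consisting of ASCII letters, apostrophes, or hyphens
tks_en = tokens_select(tks, pattern = "^[a-zA-Z'-]+$", valuetype = "regex")
tks_en[[1]][1:10]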
library(udpipe)
tokens = udpipe(docs$texts[1], 'english')
head(tokens)
## doc_id paragraph_id sentence_id sentence start
## 1 doc1 1 1 London masterclass on climate change | 1
## 2 doc1 1 1 London masterclass on climate change | 8
## 3 doc1 1 1 London masterclass on climate change | 20
## 4 doc1 1 1 London masterclass on climate change | 23
## 5 doc1 1 1 London masterclass on climate change | 31
## 6 doc1 1 1 London masterclass on climate change | 38
## end term_id token_id token lemma upos xpos feats
## 1 6 1 1 London London PROPN NNP Number=Sing
## 2 18 2 2 masterclass masterclass NOUN NN Number=Sing
## 3 21 3 3 on on ADP IN <NA>
## 4 29 4 4 climate climate NOUN NN Number=Sing
## 5 36 5 5 change change NOUN NN Number=Sing
## 6 38 6 6 | | SYM NFP <NA>
## head_token_id dep_rel deps misc
## 1 2 compound <NA> <NA>
## 2 0 root <NA> <NA>
## 3 5 case <NA> <NA>
## 4 5 compound <NA> <NA>
## 5 2 nmod <NA> <NA>
## 6 2 punct <NA> <NA>
library(rsyntax)
tokens = as_tokenindex(tokens)
head(tokens)
## doc_id paragraph_id sentence start end term_id token_id token
## 1: doc1 1 1 1 6 1 1 London
## 2: doc1 1 1 8 18 2 2 masterclass
## 3: doc1 1 1 20 21 3 3 on
## 4: doc1 1 1 23 29 4 4 climate
## 5: doc1 1 1 31 36 5 5 change
## 6: doc1 1 1 38 38 6 6 |
## lemma upos xpos feats parent relation deps misc
## 1: London PROPN NNP Number=Sing 2 compound <NA> <NA>
## 2: masterclass NOUN NN Number=Sing NA ROOT <NA> <NA>
## 3: on ADP IN <NA> 5 case <NA> <NA>
## 4: climate NOUN NN Number=Sing 5 compound <NA> <NA>
## 5: change NOUN NN Number=Sing 2 nmod <NA> <NA>
## 6: | SYM NFP <NA> 2 punct <NA> <NA>
## sentence_txt
## 1: London masterclass on climate change |
## 2: London masterclass on climate change |
## 3: London masterclass on climate change |
## 4: London masterclass on climate change |
## 5: London masterclass on climate change |
## 6: London masterclass on climate change |
plot_tree(tokens, token, lemma, upos) # sentence 1
## Document: doc1
## Sentence: 1
myStemMat <- dfm(corp_news,
remove = stopwords("english"),
stem = TRUE,
remove_numbers=TRUE,
remove_punct = TRUE)
dim(myStemMat)
## [1] 6000 68766
dim(dfm_remove(myStemMat,min_nchar=3))
## [1] 6000 68178
colnames(myStemMat)[1:20]
## [1] "london" "masterclass" "climat" "chang" "|"
## [6] "want" "understand" "march" "guardian" "assist"
## [11] "nation" "news" "editor" "jame" "randerson"
## [16] "give" "topic" "introduc" "scienc" "talk"
To lemmatize first and then create the document-term matrix:
myStemMat <- corp_news %>% quanteda::tokens(remove_numbers=TRUE,remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_replace(pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma) %>%
tokens_remove(stopwords("english")) %>%
dfm()
dim(myStemMat)
## [1] 6000 80422
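A quick sanity check on the resulting matrix is to inspect the most frequent features with topfeatures() (output omitted):
topfeatures(myStemMat, 20)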
The most discriminating words between “The Guardian” and “The Guardian - Final Edition”:
word_list = textstat_keyness(myStemMat,target=docvars(corp_news,'pub')=="The Guardian",measure="chi2")
rbind(head(word_list,10),tail(word_list,10))
## feature chi2 p n_target n_reference
## 1 block-time 1435.9900 0 5831 0
## 2 published-time 1083.4907 0 4402 0
## 3 gmt 811.4200 0 3298 0
## 4 trump 763.4474 0 3265 18
## 5 relate 681.2053 0 3613 100
## 6 bst 531.6693 0 2171 1
## 7 | 493.4682 0 99917 20814
## 8 clinton 442.7953 0 1926 14
## 9 updated-timeupdated 351.3360 0 1429 0
## 10 sander 316.8706 0 1298 1
## 80413 barclays -183.4219 0 125 140
## 80414 film -187.8940 0 389 270
## 80415 murdoch -188.9138 0 24 73
## 80416 guardian.co.uk -195.3401 0 0 48
## 80417 london -218.2224 0 1948 867
## 80418 mandela -244.7324 0 21 85
## 80419 bank -414.5988 0 2569 1255
## 80420 olympic -554.6887 0 69 213
## 80421 yesterday -2013.4198 0 314 829
## 80422 caption -2572.4867 0 94 756
# Plot estimated word keyness
textplot_keyness(word_list)
dim(myStemMat)
## [1] 6000 80422
Simil = textstat_simil(myStemMat, margin = "documents", method = "cosine")
m=as.matrix(Simil)
m[1:5,1:5]
## text136751 text118588 text45146 text93623 text136585
## text136751 1.00000000 0.1350556 0.1714198 0.07271545 0.1496148
## text118588 0.13505561 1.0000000 0.4650005 0.36059785 0.4603097
## text45146 0.17141983 0.4650005 1.0000000 0.44703558 0.7488372
## text93623 0.07271545 0.3605978 0.4470356 1.00000000 0.4228094
## text136585 0.14961477 0.4603097 0.7488372 0.42280939 1.0000000
Similarity using the tf-idf matrix:
Simil2 = textstat_simil(dfm_tfidf(myStemMat), margin = "documents", method = "cosine")
m2=as.matrix(Simil2)
m2[1:5,1:5]
## text136751 text118588 text45146 text93623 text136585
## text136751 1.0000000000 0.0042454987 0.0102009577 0.0002577826 0.009466104
## text118588 0.0042454987 1.0000000000 0.0006789199 0.0198308102 0.016307141
## text45146 0.0102009577 0.0006789199 1.0000000000 0.0044004712 0.003342253
## text93623 0.0002577826 0.0198308102 0.0044004712 1.0000000000 0.014253535
## text136585 0.0094661041 0.0163071414 0.0033422528 0.0142535354 1.000000000
Now, use the tf-idf matrix to calculate the cosine similarity with lsa::cosine():
rm = dfm_tfidf(myStemMat)
rm = convert(rm,to="matrix")
rm[1:5,1:5]
## features
## docs london masterclass climate change |
## text136751 1.3514351 6.60206 3.591679 1.4333196 0.005829544
## text118588 0.0000000 0.00000 0.000000 0.0000000 0.122420417
## text45146 0.0000000 0.00000 0.000000 0.9555464 0.151568135
## text93623 0.0000000 0.00000 0.000000 0.0000000 0.116590873
## text136585 0.6757175 0.00000 0.000000 1.4333196 0.623761172
library(lsa,quietly = T)
cosine(rm[3,],rm[4,])
## [,1]
## [1,] 0.004400471
Without weighting:
rm = convert(myStemMat,to="matrix")
cosine(rm[3,],rm[4,])
## [,1]
## [1,] 0.4470356
NOTE: Please use the tf-idf matrix to calculate cosine similarity!
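For reference, cosine similarity is just the dot product of the two row vectors divided by the product of their norms; a manual computation on the tf-idf rows should reproduce the value above (rm_tfidf is an illustrative name):
rm_tfidf = convert(dfm_tfidf(myStemMat), to = "matrix")
x = rm_tfidf[3,]; y = rm_tfidf[4,]
sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2))) # ~0.0044, as above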
#distance
d1 = as.matrix(textstat_dist(myStemMat,margin = 'documents'))
d1[1:5,1:5]
## text136751 text118588 text45146 text93623 text136585
## text136751 0.00000 43.42810 26.64583 43.71499 138.0543
## text118588 43.42810 0.00000 39.34463 49.46716 124.9600
## text45146 26.64583 39.34463 0.00000 39.76179 120.0791
## text93623 43.71499 49.46716 39.76179 0.00000 126.7912
## text136585 138.05434 124.95999 120.07914 126.79117 0.0000
d2 = as.matrix(textstat_dist(dfm_tfidf(myStemMat),margin = 'documents')) # distance on the tf-idf weighted dfm
d2[1:5,1:5]
## text136751 text118588 text45146 text93623 text136585
## text136751 1.685900e-07 6.572436e+01 16.67947 63.33740 1.216859e+02
## text118588 6.572436e+01 2.336020e-06 66.37813 89.31393 1.367456e+02
## text45146 1.667947e+01 6.637813e+01 0.00000 63.92637 1.220717e+02
## text93623 6.333740e+01 8.931393e+01 63.92637 0.00000 1.357461e+02
## text136585 1.216859e+02 1.367456e+02 122.07171 135.74614 1.078959e-05
d3 = as.matrix(textstat_dist(dfm_weight(myStemMat,"prop"),margin = 'documents')) #normalized distance
d3[1:5,1:5]
## text136751 text118588 text45146 text93623 text136585
## text136751 0.0000000 0.22300495 4.639055e-01 0.2328035 2.222957e-01
## text118588 0.2230050 0.00000000 4.148990e-01 0.1095693 9.480282e-02
## text45146 0.4639055 0.41489901 1.290478e-08 0.4134846 3.848443e-01
## text93623 0.2328035 0.10956932 4.134846e-01 0.0000000 1.051565e-01
## text136585 0.2222957 0.09480282 3.848443e-01 0.1051565 2.180170e-08
head(kwic(corp_news, "london", window = 5, valuetype = "fixed"))
## Warning: 'kwic.corpus()' is deprecated. Use 'tokens()' first.
## Keyword-in-context with 6 matches.
## [text136751, 1] | London |
## [text136751, 36] masterclass on the topic in | London |
## [text136585, 1298] In an earlier tweet the | London |
## [text107174, 432] streets of Islington in north | London |
## [text31104, 197] levels of housing benefit in | London |
## [text173244, 17] boosted as the mayor of | London |
##
## masterclass on climate change |
## . He'll introduce the science
## mayoral candidate said:"
## to find out exactly how
## with a bedroom tax designed
## , Sadiq Khan, joined
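The warning above notes that kwic() on a corpus is deprecated; the equivalent tokens-first call is (output omitted):
head(kwic(quanteda::tokens(corp_news), "london", window = 5, valuetype = "fixed"))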
Collocations:
toks_c <- quanteda::tokens(char_tolower(docs$texts[1:100]), remove_numbers=TRUE, remove_punct=TRUE)
toks_c <- quanteda::tokens_remove(toks_c, c(stopwords("english"),"|"))
colls <- textstat_collocations(toks_c, size = 2)
head(colls[order(colls$count, decreasing=TRUE),], n=10)
## collocation count count_nested length lambda
## 585 block-time published-time 86 0 2 12.150046
## 3 prime minister 42 0 2 8.847216
## 1074 block-time updated-timeupdated 37 0 2 10.478439
## 1 last year 32 0 2 5.406253
## 2 gmt block-time 30 0 2 5.369954
## 9 chief executive 19 0 2 9.147969
## 6 bernie sanders 18 0 2 7.449341
## 7 clinton says 17 0 2 5.250991
## 4 climate change 16 0 2 7.633862
## 5 david cameron 16 0 2 8.375022
## z
## 585 8.510381
## 3 21.142903
## 1074 7.339521
## 1 22.692982
## 2 22.016829
## 9 16.767894
## 6 17.517103
## 7 17.436683
## 4 17.860976
## 5 17.830078
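High-scoring collocations can then be merged into single tokens with tokens_compound(), which accepts the collocations object as a pattern; a sketch (output omitted):
# join collocations with z > 10 into single tokens, e.g. "prime_minister"
toks_comp <- tokens_compound(toks_c, pattern = colls[colls$z > 10, ])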
# readability
fk <- textstat_readability(corp_news[1:100], "Flesch.Kincaid")
head(fk)
## document Flesch.Kincaid
## 1 text136751 9.121434
## 2 text118588 10.447437
## 3 text45146 27.386410
## 4 text93623 15.512288
## 5 text136585 15.835728
## 6 text65682 13.396630
# lexical diversity (complexity)
ld <- textstat_lexdiv(myStemMat[1:100,], "TTR")
head(ld)
## document TTR
## 1 text136751 0.8064516
## 2 text118588 0.6064516
## 3 text45146 0.8620690
## 4 text93623 0.6109726
## 5 text136585 0.5222701
## 6 text65682 0.6421405
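TTR is simply the number of types divided by the number of tokens; as a rough check, it can be recomputed from the dfm (assuming the dfm methods of ntype() and ntoken()):
ntype(myStemMat[1:6,]) / ntoken(myStemMat[1:6,]) # should match the TTR values above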
Bootstrap the sentences of one document to get a confidence interval for its readability score:
library(boot)
doc = docs$texts[4]
tks = quanteda::tokens(doc,what='sentence')
data = unlist(tks)
foo <- function(data, indices){
dt<-paste(data[indices],collapse = " ")
corp_uk = corpus(dt)
fk <- textstat_readability(corp_uk, "Flesch.Kincaid")
fk$Flesch.Kincaid
}
#set.seed(12345)
myBootstrap <- boot(data, foo, R=100)
plot(myBootstrap)
boot.ci(myBootstrap)
## Warning in boot.ci(myBootstrap): bootstrap variances needed for studentized
## intervals
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 100 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = myBootstrap)
##
## Intervals :
## Level Normal Basic
## 95% (13.45, 17.68 ) (13.52, 18.02 )
##
## Level Percentile BCa
## 95% (13.01, 17.51 ) (13.46, 17.84 )
## Calculations and Intervals on Original Scale
## Some basic intervals may be unstable
## Some percentile intervals may be unstable
## Some BCa intervals may be unstable
Let’s test the significance of the readability difference between 2015 and 2016 (here, using the sentences of one sample document from each year):
docA = docs$texts[docs$year==2015]
tksA = quanteda::tokens(docA,what='sentence')
dataA = tksA[[1]]
dataA = data.frame("text"=dataA,"year"=2015,stringsAsFactors = F)
docB = docs$texts[docs$year==2016]
tksB = quanteda::tokens(docB,what='sentence')
dataB = tksB[[2]]
dataB = data.frame("text"=dataB,"year"=2016,stringsAsFactors = F)
data = rbind(dataA,dataB)
foo <- function(data,indices){
sd = data[indices,]
dataA = sd[sd$year==2015,]
dtA<-paste(dataA$text,collapse = " ")
corp_A = quanteda::corpus(dtA)
fkA <- textstat_readability(corp_A, "Flesch.Kincaid")
dataB = sd[sd$year==2016,]
dtB<-paste(dataB$text,collapse = " ")
corp_B = quanteda::corpus(dtB)
fkB <- textstat_readability(corp_B, "Flesch.Kincaid")
fkA$Flesch.Kincaid - fkB$Flesch.Kincaid
}
#set.seed(12345)
myBootstrap <- boot(data, foo, R=100)
plot(myBootstrap)
myBootstrap
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = data, statistic = foo, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -5.388291 -0.8218896 1.610336
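To judge significance, check whether zero falls outside the bootstrap confidence interval, or compare the estimate with its bootstrap standard error (a rough z-statistic):
boot.ci(myBootstrap, type = "perc") # does the percentile interval cover 0?
myBootstrap$t0 / sd(myBootstrap$t)  # rough z-statistic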
There are several packages in R, such as word2vec and text2vec; we use word2vec as an example. Download the pre-trained binary vectors here and then use read.word2vec() to load the embeddings.
library(word2vec)
## Warning: package 'word2vec' was built under R version 4.1.2
path <- system.file(package = "word2vec", "models", "example.bin")
model <- read.word2vec(path)
emb <- as.matrix(model)
x <- emb[c("bus", "toilet"), ]
y <- emb
word2vec_similarity(x, x)
## bus toilet
## bus 0.5258894 0.3906689
## toilet 0.3906689 0.4896687
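The diagonal is not 1 because word2vec_similarity() defaults to the raw dot product; cosine similarity can be requested explicitly (assuming the type argument; output omitted):
word2vec_similarity(x, x, type = "cosine")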
word2vec_similarity(x, y, top_n = 3)
## term1 term2 similarity rank
## 1 toilet gemakken 0.6743961 1
## 2 toilet voorzien 0.6610579 2
## 3 toilet badkamer 0.6447656 3
## 4 bus lopen 0.7657934 1
## 5 bus minuten 0.7428922 2
## 6 bus terug 0.7298404 3
predict(model, x, type = "nearest", top_n = 3)
## $bus
## term similarity rank
## 1 lopen 0.7657934 1
## 2 minuten 0.7428921 2
## 3 terug 0.7298404 3
##
## $toilet
## term similarity rank
## 1 gemakken 0.6743961 1
## 2 voorzien 0.6610579 2
## 3 badkamer 0.6447656 3
load("corp_news.Rdata")
corp_news <- corpus_subset(corp_news, date >= as.Date("2016-10-01"))
ndoc(corp_news)
## [1] 241
We can identify documents that are similar to one another based on their word frequencies. There are several metrics for computing similarity or distance (e.g., cosine, Jaccard, Euclidean); here we compute distances between documents on a proportion-weighted document-feature matrix and use them for hierarchical clustering.
myStemMat <- corp_news %>% quanteda::tokens(remove_numbers=TRUE,remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_replace(pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma) %>%
tokens_remove(c(stopwords("english"),"|")) %>%
dfm()
dim(dfm_remove(myStemMat,min_nchar=3))
## [1] 241 12799
# compute distances
distances <- textstat_dist(dfm_weight(myStemMat,"prop"),margin = 'documents') # normalized (proportion-weighted) distance
as.matrix(distances)[1:5, 1:5]
## text184646 text181782 text182271 text180150 text181097
## text184646 7.213990e-09 0.1233829 0.1486713 0.1257329 0.1704505
## text181782 1.233829e-01 0.0000000 0.1279958 0.1074945 0.1537039
## text182271 1.486713e-01 0.1279958 0.0000000 0.1297388 0.1755431
## text180150 1.257329e-01 0.1074945 0.1297388 0.0000000 0.1599094
## text181097 1.704505e-01 0.1537039 0.1755431 0.1599094 0.0000000
# clustering
cluster <- hclust(as.dist(distances))
memb <- cutree(cluster, k = 10)
table(memb,docvars(myStemMat)$section)
## < table of extent 10 x 0 >
plot(cluster)
# draw dendrogram with red borders around the clusters
# rect.hclust(cluster, k=20, border="red")
A different unsupervised technique is principal component analysis, a dimensionality-reduction method that identifies a set of uncorrelated components capturing most of the variance in the document-term matrix. The first component always captures the largest proportion of the variance; the second captures the second largest, and so on. Looking at the proportion of the variance captured by the first component relative to the rest, we can see to what extent the data set can be reduced to just one dimension.
# Principal components analysis
pca <- prcomp(t(as.matrix(dfm_tfidf(myStemMat))))
plot(pca) # first PC captures most of the variance
# looking at features for each PC
df <- data.frame(
featname = featnames(myStemMat),
dim1 = pca$x[,1],
dim2 = pca$x[,2],
stringsAsFactors=FALSE
)
head(df[order(df$dim1),])
## featname dim1 dim2
## landlord landlord -0.5962645 -0.5179666
## oyster oyster -0.5609731 -0.7185937
## tax tax -0.5395239 -0.3366366
## trump trump -0.5052204 -1.3510234
## mortgage mortgage -0.4836581 -0.2903064
## clinton clinton -0.4802620 -0.6980776
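The features with the largest positive scores on the first component sit at the other end (output omitted):
head(df[order(df$dim1, decreasing = TRUE),])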
A similar dimensionality reduction technique is correspondence analysis.
out <- textmodel_ca(dfm_tfidf(myStemMat))
# documents
df <- data.frame(
docname = docnames(dfm_tfidf(myStemMat)),
dim1 = out$rowcoord[,1],
dim2 = out$rowcoord[,2],
stringsAsFactors=FALSE
)
head(df[order(df$dim1),])
## docname dim1 dim2
## text184962 text184962 -0.1854266 -0.02547941
## text184558 text184558 -0.1834734 -0.41581039
## text187736 text187736 -0.1834277 -0.24972103
## text183002 text183002 -0.1833802 -0.24423282
## text181360 text181360 -0.1830867 -0.31761739
## text183060 text183060 -0.1830693 0.11801157
# features
df <- data.frame(
featname = featnames(myStemMat),
dim1 = out$colcoord[,1],
dim2 = out$colcoord[,2],
stringsAsFactors=FALSE
)
head(df[order(df$dim1),])
## featname dim1 dim2
## satanic satanic -0.1908407 -0.03372606
## hypnosis hypnosis -0.1908407 -0.03372606
## hoskins hoskins -0.1908407 -0.03372606
## wiltshire's wiltshire's -0.1908407 -0.03372606
## macpherson macpherson -0.1908407 -0.03372606
## veale veale -0.1908407 -0.03372606
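A quick base-R sketch to visualize the documents in the first two CA dimensions:
plot(out$rowcoord[,1], out$rowcoord[,2],
     xlab = "Dimension 1", ylab = "Dimension 2", pch = 19, cex = 0.5)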
The general procedure for fitting an LDA topic model:
library(topicmodels)
quantdfm <- corp_news %>% quanteda::tokens(remove_numbers=TRUE,remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_replace(pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma) %>%
tokens_remove(c(stopwords("english"),"|")) %>%
dfm() %>%
dfm_trim(min_termfreq = 4, max_docfreq = 10)
K=10
mtx = convert(quantdfm, to = "topicmodels")
lda <- LDA(mtx, k = K, method = "Gibbs",
control = list(verbose=0, seed = 123, burnin = 100, iter = 500,keep=50))
get_terms(lda, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## [1,] "landlord" "immigrant" "mine" "protester" "sleep" "ticket"
## [2,] "visa" "castro" "alcohol" "taylor" "coal" "heathrow"
## [3,] "labor" "cuba" "peace" "sex" "building" "berlin"
## [4,] "turnbull" "cuban" "taliban" "bully" "ireland" "urban"
## [5,] "rend" "african" "resolution" "king" "beach" "rail"
## [6,] "credit" "latin" "drink" "center" "emission" "airport"
## [7,] "mortgage" "permit" "pakistan" "abrini" "warehouse" "isis"
## [8,] "gun" "revolution" "export" "stein" "resort" "christian"
## [9,] "hammond" "vehicle" "spin-dry" "session" "tech" "injure"
## [10,] "wealth" "castro's" "guarantee" "jury" "purchase" "drone"
## Topic 7 Topic 8 Topic 9 Topic 10
## [1,] "oyster" "google" "nhs" "de"
## [2,] "plastic" "app" "girl" "la"
## [3,] "waste" "syrian" "ukip" "el"
## [4,] "bed" "window" "fbi" "en"
## [5,] "island" "aleppo" "corbyn" "los"
## [6,] "elephant" "digital" "indigenous" "n"
## [7,] "bird" "machine" "nice" "que"
## [8,] "animal" "lewis" "weiner" "y"
## [9,] "recycle" "box" "honour" "census"
## [10,] "tower" "productivity" "breast" "las"
topics <- get_topics(lda, 1)
head(topics)
## text184646 text181782 text182271 text180150 text181097 text183267
## 9 4 6 6 7 4
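Beyond the single most likely topic, the full document-topic distributions are available via posterior() from topicmodels (output omitted):
round(posterior(lda)$topics[1:3,], 2)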
Two indicators of model fit, perplexity and log-likelihood:
pp = perplexity(lda,convert(quantdfm, to = "topicmodels"))
ll = logLik(lda)
print(pp)
## [1] 1056.495
print(ll)
## 'log Lik.' -140953.5 (df=26620)
lda@logLiks[-c(1:(100/50))]
## [1] -141077.4 -140655.7 -140802.3 -140962.2 -140834.0 -140620.9 -141036.6
## [8] -140654.1 -140594.2 -140953.5
Select K based on model fit; here we compare models via the harmonic mean of the post-burn-in log-likelihoods:
burnin = 1000
iter = 1000
keep = 50
# generate numerous topic models with different numbers of topics
sequ <- seq(0, 20, 5)[2:5] # candidate numbers of topics: 5, 10, 15, 20
fitted_many <- lapply(sequ, function(k) LDA(mtx, k = k, method = "Gibbs",
control = list(verbose=0,burnin = burnin, iter = iter, keep = keep)))
#save(fitted_many,file='TopicModels_f.Rdata')
Extract the log-likelihoods from each topic model (discarding the burn-in iterations):
logLiks_many <- lapply(fitted_many, function(L) L@logLiks[-c(1:(burnin/keep))])
# compute harmonic means
# install.packages('psych')
library(psych)
hm_many <- sapply(logLiks_many, function(h) harmonic.mean(h))
# inspect
plot(sequ, hm_many, type = "l")
# the number of topics with the highest harmonic mean
sequ[which.max(hm_many)]
## [1] 20
final <- fitted_many[[3]] # proceed with the 15-topic model
get_terms(final,10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "immigrant" "honour" "girl" "ukip" "landlord"
## [2,] "lewis" "taylor" "warehouse" "visa" "rend"
## [3,] "productivity" "hong" "resort" "labor" "credit"
## [4,] "rosser" "gender" "vaccine" "corbyn" "mortgage"
## [5,] "heathrow" "center" "elephant" "gun" "resolution"
## [6,] "airport" "tower" "privacy" "turnbull" "wealth"
## [7,] "register" "manchester" "bird" "shortlist" "rail"
## [8,] "rogers" "actor" "winter" "canada" "beach"
## [9,] "document" "lgbt" "enforcement" "shorten" "rent"
## [10,] "expansion" "beijing" "wildlife" "exit" "borrow"
## Topic 6 Topic 7 Topic 8 Topic 9 Topic 10 Topic 11
## [1,] "de" "syrian" "google" "app" "oyster" "peace"
## [2,] "la" "aleppo" "sleep" "ticket" "bed" "taliban"
## [3,] "el" "sex" "building" "window" "bully" "pakistan"
## [4,] "en" "census" "machine" "digital" "island" "christian"
## [5,] "los" "isis" "ireland" "fake" "abortion" "russia"
## [6,] "alcohol" "humanitarian" "tech" "content" "tassal" "saudi"
## [7,] "n" "abrini" "samsung" "seize" "pub" "supreme"
## [8,] "que" "port" "pixel" "update" "clinic" "hunt"
## [9,] "pilot" "jury" "irish" "microsoft" "bay" "arabia"
## [10,] "drink" "prosecution" "smartphones" "wi-fi" "piece" "germany"
## Topic 12 Topic 13 Topic 14 Topic 15
## [1,] "fbi" "nhs" "castro" "plastic"
## [2,] "indigenous" "nice" "cuba" "mine"
## [3,] "weiner" "breast" "protester" "coal"
## [4,] "sander" "hammond" "cuban" "emission"
## [5,] "bureau" "export" "latin" "berlin"
## [6,] "podesta" "healthcare" "castro's" "waste"
## [7,] "nominee" "uber" "african" "urban"
## [8,] "comey" "permit" "revolution" "injure"
## [9,] "romney" "vehicle" "soviet" "recycle"
## [10,] "aboriginal" "medicine" "revolutionary" "poverty"
Select K using 5-fold cross-validation over different numbers of topics:
library(cvTools)
library(dplyr)
# dtm is the document-term matrix (converted with convert(..., to = "topicmodels") above)
cvLDA <- function(Ntopics,dtm,K=5) {
folds<-cvFolds(nrow(dtm),K,1)
perplex <- rep(NA,K)
llk <- rep(NA,K)
for(i in unique(folds$which)){
cat(i, " ")
which.test <- folds$subsets[folds$which==i]
which.train <- {1:nrow(dtm)}[-which.test]
dtm.train <- dtm[which.train,]
dtm.test <- dtm[which.test,]
lda.fit <- LDA(dtm.train, k=Ntopics, method="Gibbs",control=list(verbose=0, iter=100))
perplex[i] <- perplexity(lda.fit, dtm.test)
llk[i] <- logLik(lda.fit)
}
res = data.frame(n=rep(Ntopics,K),perplexity=perplex,logLik=llk)
return(res)
}
# now test:
N <- c(20, 30, 40, 50, 60, 70)
results <- data.frame()
for (n in N){
cat("\n\n\n##########\n ", n, "topics", "\n")
res <- cvLDA(n, mtx)
results <- bind_rows(results,res)
}
##
##
##
## ##########
## 20 topics
## 1 2 3 4 5
##
##
## ##########
## 30 topics
## 1 2 3 4 5
##
##
## ##########
## 40 topics
## 1 2 3 4 5
##
##
## ##########
## 50 topics
## 1 2 3 4 5
##
##
## ##########
## 60 topics
## 1 2 3 4 5
##
##
## ##########
## 70 topics
## 1 2 3 4 5
results$ratio_perp1 <- results$perplexity / max(results$perplexity)
results$ratio_lk1 <- results$logLik / min(results$logLik)
df <- results %>% group_by(n) %>%
summarise(ratio_perp=mean(ratio_perp1),sd_perp=sd(ratio_perp1),
ratio_lk=mean(ratio_lk1),sd_lk=sd(ratio_lk1))
df
## # A tibble: 6 x 5
## n ratio_perp sd_perp ratio_lk sd_lk
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20 0.915 0.0583 0.970 0.0282
## 2 30 0.834 0.0398 0.964 0.0258
## 3 40 0.779 0.0419 0.963 0.0134
## 4 50 0.731 0.0451 0.963 0.0213
## 5 60 0.711 0.0262 0.967 0.0310
## 6 70 0.709 0.0622 0.967 0.0226
pd = df[,1:3]
colnames(pd)=c('n','value',"sd")
pd$variable = "Perplexity"
df$variable = "LogLikelihood"
colnames(df)[4:5]=c('value',"sd")
pd = bind_rows(pd,df[,c(1,4:6)])
pd
## # A tibble: 12 x 4
## n value sd variable
## <dbl> <dbl> <dbl> <chr>
## 1 20 0.915 0.0583 Perplexity
## 2 30 0.834 0.0398 Perplexity
## 3 40 0.779 0.0419 Perplexity
## 4 50 0.731 0.0451 Perplexity
## 5 60 0.711 0.0262 Perplexity
## 6 70 0.709 0.0622 Perplexity
## 7 20 0.970 0.0282 LogLikelihood
## 8 30 0.964 0.0258 LogLikelihood
## 9 40 0.963 0.0134 LogLikelihood
## 10 50 0.963 0.0213 LogLikelihood
## 11 60 0.967 0.0310 LogLikelihood
## 12 70 0.967 0.0226 LogLikelihood
library(ggplot2)
library(grid)
p <- ggplot(pd, aes(x=n, y=value, linetype=variable))
pq <- p + geom_line() + geom_point(aes(shape=variable),
fill="white", shape=21, size=1.40) +
geom_errorbar(aes(ymax=value+sd, ymin=value-sd), width=4) +
scale_y_continuous("Ratio wrt worst value") +
scale_x_continuous("Number of topics",
breaks=N) + # N holds the candidate numbers of topics
theme_bw()
pq