1. Pre-Processing

Convert texts to a corpus. We can use several packages in R: tm, quanteda, etc. Here we introduce quanteda, which is more convenient for our tasks:

# load library and data
library(quanteda,quietly = T)
library(quanteda.textstats) # keyness readability etc.
library(quanteda.textplots)
library(quanteda.textmodels) # pca/ca/lda
library(lubridate) # time processing

1.1. quanteda package

To convert text documents to corpus using qunateda::corpus():

# devtools::install_github("quanteda/quanteda.corpora")
# library(quanteda.corpora)
# corp_news <- download('data_corpus_guardian')

load("docs.Rdata")
docs$year = year(docs$date)

# texts -> corpus
corp_news = corpus(docs$texts) # the input could be a data.frame (set text_field)

# set the meta variable party & year:
docvars(corp_news, "pub") <- docs$pub
docvars(corp_news, "year") <- docs$year
summary(corp_news[1:10])
## Corpus consisting of 10 documents, showing 10 documents:
## 
##    Text Types Tokens Sentences                          pub year
##   text1    51     68         4                 The Guardian 2016
##   text2   413   1052        41                 The Guardian 2015
##   text3    49     83         1                 The Guardian 2014
##   text4   348    866        27                 The Guardian 2015
##   text5   986   2768        75                 The Guardian 2016
##   text6   522   1271        43                 The Guardian 2014
##   text7   551   1505        56                 The Guardian 2015
##   text8   245    544        23 The Guardian - Final Edition 2013
##   text9   158    260         6 The Guardian - Final Edition 2013
##  text10   424   1030        37                 The Guardian 2016
summary(corpus_subset(corp_news,year==2016)[1:10])
## Corpus consisting of 10 documents, showing 10 documents:
## 
##    Text Types Tokens Sentences          pub year
##   text1    51     68         4 The Guardian 2016
##   text5   986   2768        75 The Guardian 2016
##  text10   424   1030        37 The Guardian 2016
##  text11   408   1111        45 The Guardian 2016
##  text13   556   1261        39 The Guardian 2016
##  text15   518   1384        53 The Guardian 2016
##  text17   513   1128        33 The Guardian 2016
##  text19   324    683        19 The Guardian 2016
##  text25   300    734        21 The Guardian 2016
##  text26   232    407        16 The Guardian 2016
corp_news = corpus(docs[,c("texts","pub","year")],text_field = "texts")
summary(corp_news[1:10])
## Corpus consisting of 10 documents, showing 10 documents:
## 
##        Text Types Tokens Sentences                          pub year
##  text136751    51     68         4                 The Guardian 2016
##  text118588   413   1052        41                 The Guardian 2015
##   text45146    49     83         1                 The Guardian 2014
##   text93623   348    866        27                 The Guardian 2015
##  text136585   986   2768        75                 The Guardian 2016
##   text65682   522   1271        43                 The Guardian 2014
##  text107174   551   1505        56                 The Guardian 2015
##   text22792   245    544        23 The Guardian - Final Edition 2013
##   text32425   158    260         6 The Guardian - Final Edition 2013
##  text139163   424   1030        37                 The Guardian 2016
docvars(corp_news,"pub")[1:10]
##  [1] "The Guardian"                 "The Guardian"                
##  [3] "The Guardian"                 "The Guardian"                
##  [5] "The Guardian"                 "The Guardian"                
##  [7] "The Guardian"                 "The Guardian - Final Edition"
##  [9] "The Guardian - Final Edition" "The Guardian"

To check the first document:

as.character(corp_news[1])
##                                                                                                                                                                                                                                                                                                                                                                          text136751 
## "London masterclass on climate change | Do you want to understand more about climate change? On 14 March the Guardian's assistant national news editor James Randerson is giving a masterclass on the topic in London. He'll introduce the science and talk about predictions for what impact the changing climate will have - and discuss what we can do about it. And finally..."

To convert documents to tokens:

# txt -> tokens
A = quanteda::tokens(docs$texts[1])
A[[1]][1:20]
##  [1] "London"      "masterclass" "on"          "climate"     "change"     
##  [6] "|"           "Do"          "you"         "want"        "to"         
## [11] "understand"  "more"        "about"       "climate"     "change"     
## [16] "?"           "On"          "14"          "March"       "the"
# corpus -> tokens
B = quanteda::tokens(corp_news[1])
B[[1]][1:20]
##  [1] "London"      "masterclass" "on"          "climate"     "change"     
##  [6] "|"           "Do"          "you"         "want"        "to"         
## [11] "understand"  "more"        "about"       "climate"     "change"     
## [16] "?"           "On"          "14"          "March"       "the"

Preprocess the tokens:

# convert to lower cases
tks_lower = tokens_tolower(quanteda::tokens(corp_news))

# steming
tks_stem = tokens_wordstem(tks_lower)
# or lemmatization [textstem::lemmatize_strings]
tks_lemma = tokens_replace(tks_lower, pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma)

Compare the differences between tks_stem and tks_lemma:

tks_stem[1]
## Tokens consisting of 1 document and 2 docvars.
## text136751 :
##  [1] "london"      "masterclass" "on"          "climat"      "chang"      
##  [6] "|"           "do"          "you"         "want"        "to"         
## [11] "understand"  "more"       
## [ ... and 56 more ]
tks_lemma[1]
## Tokens consisting of 1 document and 2 docvars.
## text136751 :
##  [1] "london"      "masterclass" "on"          "climate"     "change"     
##  [6] "|"           "do"          "you"         "want"        "to"         
## [11] "understand"  "much"       
## [ ... and 56 more ]
# remove numbers and punctuation
tks = quanteda::tokens(tks_lemma, remove_numbers = TRUE,  remove_punct = TRUE)
tks[[1]][1:10]
##  [1] "london"      "masterclass" "on"          "climate"     "change"     
##  [6] "|"           "do"          "you"         "want"        "to"
# tks[1]
tks = tokens_remove(tks,stopwords('english'),padding = F) # remove "" or not.
tks[[1]][1:10]
##  [1] "london"      "masterclass" "climate"     "change"      "|"          
##  [6] "want"        "understand"  "much"        "climate"     "change"

Question: how to remove non-English characters?

1.2. Syntactic Analysis

library(udpipe)
tokens = udpipe(docs$texts[1], 'english')
head(tokens)
##   doc_id paragraph_id sentence_id                               sentence start
## 1   doc1            1           1 London masterclass on climate change |     1
## 2   doc1            1           1 London masterclass on climate change |     8
## 3   doc1            1           1 London masterclass on climate change |    20
## 4   doc1            1           1 London masterclass on climate change |    23
## 5   doc1            1           1 London masterclass on climate change |    31
## 6   doc1            1           1 London masterclass on climate change |    38
##   end term_id token_id       token       lemma  upos xpos       feats
## 1   6       1        1      London      London PROPN  NNP Number=Sing
## 2  18       2        2 masterclass masterclass  NOUN   NN Number=Sing
## 3  21       3        3          on          on   ADP   IN        <NA>
## 4  29       4        4     climate     climate  NOUN   NN Number=Sing
## 5  36       5        5      change      change  NOUN   NN Number=Sing
## 6  38       6        6           |           |   SYM  NFP        <NA>
##   head_token_id  dep_rel deps misc
## 1             2 compound <NA> <NA>
## 2             0     root <NA> <NA>
## 3             5     case <NA> <NA>
## 4             5 compound <NA> <NA>
## 5             2     nmod <NA> <NA>
## 6             2    punct <NA> <NA>
library(rsyntax)
tokens = as_tokenindex(tokens)
head(tokens)
##    doc_id paragraph_id sentence start end term_id token_id       token
## 1:   doc1            1        1     1   6       1        1      London
## 2:   doc1            1        1     8  18       2        2 masterclass
## 3:   doc1            1        1    20  21       3        3          on
## 4:   doc1            1        1    23  29       4        4     climate
## 5:   doc1            1        1    31  36       5        5      change
## 6:   doc1            1        1    38  38       6        6           |
##          lemma  upos xpos       feats parent relation deps misc
## 1:      London PROPN  NNP Number=Sing      2 compound <NA> <NA>
## 2: masterclass  NOUN   NN Number=Sing     NA     ROOT <NA> <NA>
## 3:          on   ADP   IN        <NA>      5     case <NA> <NA>
## 4:     climate  NOUN   NN Number=Sing      5 compound <NA> <NA>
## 5:      change  NOUN   NN Number=Sing      2     nmod <NA> <NA>
## 6:           |   SYM  NFP        <NA>      2    punct <NA> <NA>
##                              sentence_txt
## 1: London masterclass on climate change |
## 2: London masterclass on climate change |
## 3: London masterclass on climate change |
## 4: London masterclass on climate change |
## 5: London masterclass on climate change |
## 6: London masterclass on climate change |
plot_tree(tokens, token, lemma, upos) # sentence 1
## Document: doc1
## Sentence: 1

Syntax Tree

2. Vector Space Model

2.1. Document-Term Matrix

myStemMat <- dfm(corp_news, 
                 remove = stopwords("english"), 
                 stem = TRUE, 
                 remove_numbers=TRUE,
                 remove_punct = TRUE)
dim(myStemMat)
## [1]  6000 68766
dim(dfm_remove(myStemMat,min_nchar=3))
## [1]  6000 68178
colnames(myStemMat)[1:20]
##  [1] "london"      "masterclass" "climat"      "chang"       "|"          
##  [6] "want"        "understand"  "march"       "guardian"    "assist"     
## [11] "nation"      "news"        "editor"      "jame"        "randerson"  
## [16] "give"        "topic"       "introduc"    "scienc"      "talk"

To lemmatize first and then create document-term matrix:

myStemMat <- corp_news %>% quanteda::tokens(remove_numbers=TRUE,remove_punct = TRUE) %>% 
  tokens_tolower() %>%
  tokens_replace(pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma) %>%
  tokens_remove(stopwords("english")) %>% 
  dfm()
dim(myStemMat)
## [1]  6000 80422

2.2. Keywords Selection

The most discriminant words between “The Guardian” and “The Guardian - Final Edition”:

word_list = textstat_keyness(myStemMat,target=docvars(corp_news,'pub')=="The Guardian",measure="chi2")
rbind(head(word_list,10),tail(word_list,10))
##                   feature       chi2 p n_target n_reference
## 1              block-time  1435.9900 0     5831           0
## 2          published-time  1083.4907 0     4402           0
## 3                     gmt   811.4200 0     3298           0
## 4                   trump   763.4474 0     3265          18
## 5                  relate   681.2053 0     3613         100
## 6                     bst   531.6693 0     2171           1
## 7                       |   493.4682 0    99917       20814
## 8                 clinton   442.7953 0     1926          14
## 9     updated-timeupdated   351.3360 0     1429           0
## 10                 sander   316.8706 0     1298           1
## 80413            barclays  -183.4219 0      125         140
## 80414                film  -187.8940 0      389         270
## 80415             murdoch  -188.9138 0       24          73
## 80416      guardian.co.uk  -195.3401 0        0          48
## 80417              london  -218.2224 0     1948         867
## 80418             mandela  -244.7324 0       21          85
## 80419                bank  -414.5988 0     2569        1255
## 80420             olympic  -554.6887 0       69         213
## 80421           yesterday -2013.4198 0      314         829
## 80422             caption -2572.4867 0       94         756
# Plot estimated word keyness
textplot_keyness(word_list) 

2.3. Distance & Similarity

dim(myStemMat)
## [1]  6000 80422
Simil = textstat_simil(myStemMat, margin = "documents", method = "cosine")
m=as.matrix(Simil)
m[1:5,1:5]
##            text136751 text118588 text45146  text93623 text136585
## text136751 1.00000000  0.1350556 0.1714198 0.07271545  0.1496148
## text118588 0.13505561  1.0000000 0.4650005 0.36059785  0.4603097
## text45146  0.17141983  0.4650005 1.0000000 0.44703558  0.7488372
## text93623  0.07271545  0.3605978 0.4470356 1.00000000  0.4228094
## text136585 0.14961477  0.4603097 0.7488372 0.42280939  1.0000000

Similarity using matrix tf-idf:

Simil2 = textstat_simil(dfm_tfidf(myStemMat), margin = "documents", method = "cosine")
m2=as.matrix(Simil2)
m2[1:5,1:5]
##              text136751   text118588    text45146    text93623  text136585
## text136751 1.0000000000 0.0042454987 0.0102009577 0.0002577826 0.009466104
## text118588 0.0042454987 1.0000000000 0.0006789199 0.0198308102 0.016307141
## text45146  0.0102009577 0.0006789199 1.0000000000 0.0044004712 0.003342253
## text93623  0.0002577826 0.0198308102 0.0044004712 1.0000000000 0.014253535
## text136585 0.0094661041 0.0163071414 0.0033422528 0.0142535354 1.000000000

Now, use tf-idf matrix to calculate the similarity scores:

rm = dfm_tfidf(myStemMat)
rm = convert(rm,to="matrix")
rm[1:5,1:5]
##             features
## docs            london masterclass  climate    change           |
##   text136751 1.3514351     6.60206 3.591679 1.4333196 0.005829544
##   text118588 0.0000000     0.00000 0.000000 0.0000000 0.122420417
##   text45146  0.0000000     0.00000 0.000000 0.9555464 0.151568135
##   text93623  0.0000000     0.00000 0.000000 0.0000000 0.116590873
##   text136585 0.6757175     0.00000 0.000000 1.4333196 0.623761172
library(lsa,quietly = T)
cosine(rm[3,],rm[4,])
##             [,1]
## [1,] 0.004400471

Without weighting:

rm = convert(myStemMat,to="matrix")
cosine(rm[3,],rm[4,])
##           [,1]
## [1,] 0.4470356

NOTE: Please use the tf-idf matrix to calculate cosine similarity!

#distance
d1 = as.matrix(textstat_dist(myStemMat,margin = 'documents'))
d1[1:5,1:5]
##            text136751 text118588 text45146 text93623 text136585
## text136751    0.00000   43.42810  26.64583  43.71499   138.0543
## text118588   43.42810    0.00000  39.34463  49.46716   124.9600
## text45146    26.64583   39.34463   0.00000  39.76179   120.0791
## text93623    43.71499   49.46716  39.76179   0.00000   126.7912
## text136585  138.05434  124.95999 120.07914 126.79117     0.0000
d2 = as.matrix(textstat_dist(dfm_tfidf(myStemMat),margin = 'documents')) # a two-fold normalization
d2[1:5,1:5]
##              text136751   text118588 text45146 text93623   text136585
## text136751 1.685900e-07 6.572436e+01  16.67947  63.33740 1.216859e+02
## text118588 6.572436e+01 2.336020e-06  66.37813  89.31393 1.367456e+02
## text45146  1.667947e+01 6.637813e+01   0.00000  63.92637 1.220717e+02
## text93623  6.333740e+01 8.931393e+01  63.92637   0.00000 1.357461e+02
## text136585 1.216859e+02 1.367456e+02 122.07171 135.74614 1.078959e-05
d3 = as.matrix(textstat_dist(dfm_weight(myStemMat,"prop"),margin = 'documents')) #normalized distance
d3[1:5,1:5]
##            text136751 text118588    text45146 text93623   text136585
## text136751  0.0000000 0.22300495 4.639055e-01 0.2328035 2.222957e-01
## text118588  0.2230050 0.00000000 4.148990e-01 0.1095693 9.480282e-02
## text45146   0.4639055 0.41489901 1.290478e-08 0.4134846 3.848443e-01
## text93623   0.2328035 0.10956932 4.134846e-01 0.0000000 1.051565e-01
## text136585  0.2222957 0.09480282 3.848443e-01 0.1051565 2.180170e-08

2.4. KWIC (Key Words In Context) & Collocations

head(kwic(corp_news, "london", window = 5, valuetype = "fixed"))
## Warning: 'kwic.corpus()' is deprecated. Use 'tokens()' first.
## Keyword-in-context with 6 matches.                                                            
##     [text136751, 1]                               | London |
##    [text136751, 36]   masterclass on the topic in | London |
##  [text136585, 1298]       In an earlier tweet the | London |
##   [text107174, 432] streets of Islington in north | London |
##    [text31104, 197]  levels of housing benefit in | London |
##    [text173244, 17]       boosted as the mayor of | London |
##                                 
##  masterclass on climate change |
##  . He'll introduce the science  
##  mayoral candidate said:"       
##  to find out exactly how        
##  with a bedroom tax designed    
##  , Sadiq Khan, joined

Collocation:

toks_c <- quanteda::tokens(char_tolower(docs$texts[1:100]), remove_numbers=TRUE, remove_punct=TRUE)
toks_c <- quanteda::tokens_remove(toks_c, c(stopwords("english"),"|"))
colls <- textstat_collocations(toks_c, size = 2)
head(colls[order(colls$count, decreasing=TRUE),], n=10)
##                         collocation count count_nested length    lambda
## 585       block-time published-time    86            0      2 12.150046
## 3                    prime minister    42            0      2  8.847216
## 1074 block-time updated-timeupdated    37            0      2 10.478439
## 1                         last year    32            0      2  5.406253
## 2                    gmt block-time    30            0      2  5.369954
## 9                   chief executive    19            0      2  9.147969
## 6                    bernie sanders    18            0      2  7.449341
## 7                      clinton says    17            0      2  5.250991
## 4                    climate change    16            0      2  7.633862
## 5                     david cameron    16            0      2  8.375022
##              z
## 585   8.510381
## 3    21.142903
## 1074  7.339521
## 1    22.692982
## 2    22.016829
## 9    16.767894
## 6    17.517103
## 7    17.436683
## 4    17.860976
## 5    17.830078

2.5. Diveristy and Complexity

# readability
fk <- textstat_readability(corp_news[1:100], "Flesch.Kincaid")
head(fk)
##     document Flesch.Kincaid
## 1 text136751       9.121434
## 2 text118588      10.447437
## 3  text45146      27.386410
## 4  text93623      15.512288
## 5 text136585      15.835728
## 6  text65682      13.396630
# complexity
ld <- textstat_lexdiv(myStemMat[1:100,], "TTR")
head(ld)
##     document       TTR
## 1 text136751 0.8064516
## 2 text118588 0.6064516
## 3  text45146 0.8620690
## 4  text93623 0.6109726
## 5 text136585 0.5222701
## 6  text65682 0.6421405

2.6. Hypothesis Testing (Bootstrapping)

library(boot)

doc = docs$texts[4]
tks = quanteda::tokens(doc,what='sentence')
data = unlist(tks)

foo <- function(data, indices){
  dt<-paste(data[indices],collapse = " ")
  corp_uk = corpus(dt)
  fk <- textstat_readability(corp_uk, "Flesch.Kincaid")
  fk$Flesch.Kincaid
}

#set.seed(12345)
myBootstrap <- boot(data, foo, R=100)
plot(myBootstrap)

boot.ci(myBootstrap)
## Warning in boot.ci(myBootstrap): bootstrap variances needed for studentized
## intervals
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 100 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = myBootstrap)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (13.45, 17.68 )   (13.52, 18.02 )  
## 
## Level     Percentile            BCa          
## 95%   (13.01, 17.51 )   (13.46, 17.84 )  
## Calculations and Intervals on Original Scale
## Some basic intervals may be unstable
## Some percentile intervals may be unstable
## Some BCa intervals may be unstable

Let’s test the significance of the difference between year 2015 and year 2016:

docA = docs$texts[docs$year==2015]
tksA = quanteda::tokens(docA,what='sentence')
dataA = tksA[[1]]
dataA = data.frame("text"=dataA,"year"=2015,stringsAsFactors = F)

docB = docs$texts[docs$year==2016]
tksB = quanteda::tokens(docB,what='sentence')
dataB = tksB[[2]]
dataB = data.frame("text"=dataB,"year"="2016",stringsAsFactors = F)

data = rbind(dataA,dataB)

foo <- function(data,indices){
  sd = data[indices,]
  
  dataA = sd[sd$year==2015,]
  dtA<-paste(dataA$text,collapse = " ")
  corp_A = quanteda::corpus(dtA)
  fkA <- textstat_readability(corp_A, "Flesch.Kincaid")
  
  dataB = sd[sd$year==2016,]
  dtB<-paste(dataB$text,collapse = " ")
  corp_B = quanteda::corpus(dtB)
  fkB <- textstat_readability(corp_B, "Flesch.Kincaid")
  
  fkA$Flesch.Kincaid - fkB$Flesch.Kincaid
}

#set.seed(12345)
myBootstrap <- boot(data, foo, R=100)
plot(myBootstrap)

myBootstrap
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = data, statistic = foo, R = 100)
## 
## 
## Bootstrap Statistics :
##      original     bias    std. error
## t1* -5.388291 -0.8218896    1.610336

2.7. Word Embeddings

There are several packages in r, such as word2vec and text2vec. We use word2vec as an example. Download pre-trained binary vectors here and then using read.word2vec to load the embeddings.

library(word2vec)
## Warning: package 'word2vec' was built under R version 4.1.2
path <- system.file(package = "word2vec", "models", "example.bin")
model <- read.word2vec(path)
emb <- as.matrix(model)
x <- emb[c("bus", "toilet"), ]
y <- emb
word2vec_similarity(x, x)
##              bus    toilet
## bus    0.5258894 0.3906689
## toilet 0.3906689 0.4896687
word2vec_similarity(x, y, top_n = 3)
##    term1    term2 similarity rank
## 1 toilet gemakken  0.6743961    1
## 2 toilet voorzien  0.6610579    2
## 3 toilet badkamer  0.6447656    3
## 4    bus    lopen  0.7657934    1
## 5    bus  minuten  0.7428922    2
## 6    bus    terug  0.7298404    3
predict(model, x, type = "nearest", top_n = 3)
## $bus
##      term similarity rank
## 1   lopen  0.7657934    1
## 2 minuten  0.7428921    2
## 3   terug  0.7298404    3
## 
## $toilet
##       term similarity rank
## 1 gemakken  0.6743961    1
## 2 voorzien  0.6610579    2
## 3 badkamer  0.6447656    3

3. Unsupervised Learning

load("corp_news.Rdata")
corp_news <- corpus_subset(corp_news, date >= as.Date("2016-10-01"))
ndoc(corp_news)
## [1] 241

3.1. Clustering

We can identify documents that are similar to one another based on the frequency of words, using similarity. There’s different metrics to compute similarity. Here we explore two of them: Jaccard distance and Cosine distance.

myStemMat <- corp_news %>% quanteda::tokens(remove_numbers=TRUE,remove_punct = TRUE) %>% 
  tokens_tolower() %>%
  tokens_replace(pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma) %>%
  tokens_remove(c(stopwords("english"),"|")) %>% 
  dfm()
dim(dfm_remove(myStemMat,min_nchar=3))
## [1]   241 12799
# compute distances
distances <- textstat_dist(dfm_weight(myStemMat,"prop"),margin = 'documents') #nromalized distance
as.matrix(distances)[1:5, 1:5]
##              text184646 text181782 text182271 text180150 text181097
## text184646 7.213990e-09  0.1233829  0.1486713  0.1257329  0.1704505
## text181782 1.233829e-01  0.0000000  0.1279958  0.1074945  0.1537039
## text182271 1.486713e-01  0.1279958  0.0000000  0.1297388  0.1755431
## text180150 1.257329e-01  0.1074945  0.1297388  0.0000000  0.1599094
## text181097 1.704505e-01  0.1537039  0.1755431  0.1599094  0.0000000
# clustering
cluster <- hclust(as.dist(distances))
memb <- cutree(cluster, k = 10)
table(memb,docvars(myStemMat)$section)
## < table of extent 10 x 0 >
plot(cluster)

# draw dendogram with red borders around the 5 clusters 
# rect.hclust(cluster, k=20, border="red")

3.2. Dimensionality Reduction (PCA+CA)

A different type of clustering is principal component analysis. This technique will try to identify a set of uncorrelated variables that capture most of the variance in the document-term matrix. The first component will always capture the largest proportion of the variance; the second captures the second largest, etc. Looking at the relative proportion of the variance captured by the first component vs the rest, we can see to what extent we can reduce the data set to just one dimension.

# Principal components analysis
pca <- prcomp(t(as.matrix(dfm_tfidf(myStemMat)))) 
plot(pca) # first PC captures most of the variance