1. Basic Manipulation & Metrics

1.1. Network objects

We will use the SNA data, which were collected form a US discussion forum. The rows indicate replies between users. All users have self-reported political ideology. The data were stored in edgelist format.

load("SNA.Rdata")
nodes = SNA[,c("sender","leanR")]
nodes = nodes[!duplicated(nodes$sender),] #1178 users
table(nodes$leanR)
## 
## central    left   right 
##     559     261     358
library(dplyr)
links = SNA %>% dplyr::select("sender","receiver") %>%
  dplyr::filter(sender!=receiver) %>%
  dplyr::group_by(sender,receiver) %>% dplyr::summarise(weight=length(sender))
head(links)
## # A tibble: 6 x 3
## # Groups:   sender [1]
##   sender     receiver     weight
##   <chr>      <chr>         <int>
## 1 ????? ???? 99percenter       2
## 2 ????? ???? AdamT            14
## 3 ????? ???? AliHajiSheik      1
## 4 ????? ???? Amigo             1
## 5 ????? ???? apdst             3
## 6 ????? ???? areafiftyone      1

We introduce igraph to import edgelist

library(igraph)
net = graph.data.frame(links, nodes, directed=T)
net
## IGRAPH f2b2709 DNW- 1178 43815 -- 
## + attr: name (v/c), leanR (v/c), weight (e/n)
## + edges from f2b2709 (vertex names):
##  [1] ????? ????->99percenter     ????? ????->AdamT          
##  [3] ????? ????->AliHajiSheik    ????? ????->Amigo          
##  [5] ????? ????->apdst           ????? ????->areafiftyone   
##  [7] ????? ????->Aunt Spiker     ????? ????->Awesome!       
##  [9] ????? ????->beerftw         ????? ????->Bigfoot 88     
## [11] ????? ????->Billy the Kid   ????? ????->Binky          
## [13] ????? ????->Blue_State      ????? ????->Bobcat         
## [15] ????? ????->Boo Radley      ????? ????->Born Free      
## + ... omitted several edges
E(net) # The edges of the "net" object
## + 43815/43815 edges from f2b2709 (vertex names):
##  [1] ????? ????->99percenter     ????? ????->AdamT          
##  [3] ????? ????->AliHajiSheik    ????? ????->Amigo          
##  [5] ????? ????->apdst           ????? ????->areafiftyone   
##  [7] ????? ????->Aunt Spiker     ????? ????->Awesome!       
##  [9] ????? ????->beerftw         ????? ????->Bigfoot 88     
## [11] ????? ????->Billy the Kid   ????? ????->Binky          
## [13] ????? ????->Blue_State      ????? ????->Bobcat         
## [15] ????? ????->Boo Radley      ????? ????->Born Free      
## [17] ????? ????->Bronson         ????? ????->Caine          
## [19] ????? ????->Camlon          ????? ????->Captain America
## + ... omitted several edges
V(net) # The vertices of the "net" object
## + 1178/1178 vertices, named, from f2b2709:
##    [1] Gipper               Redress              soccerboy22         
##    [4] jasonxe              washunut             Catawba             
##    [7] TOJ                  Cecil James          _Markum_            
##   [10] Councilman           cpwill               cAPSLOCK            
##   [13] Ikari                Gina                 pbrauer             
##   [16] nota bene            Boo Radley           randel              
##   [19] Hugh_Akston          upsideguy            dixiesolutions      
##   [22] American             Top Cat              lpast               
##   [25] Risky Thicket        Crosscheck           Temporal            
##   [28] ChuckBerry           Fiddytree            Voltaire X          
## + ... omitted several vertices
hist(E(net)$weight) # Edge weight

table(V(net)$leanR) # Vertex attribute ideology
## 
## central    left   right 
##     559     261     358

We can also import matrix

mx = as.matrix(get.adjacency(net))
net2 = graph.adjacency(mx)
net2
## IGRAPH f348749 DN-- 1178 43815 -- 
## + attr: name (v/c)
## + edges from f348749 (vertex names):
##  [1] Gipper ->Redress      Gipper ->cpwill       Gipper ->Ikari       
##  [4] Gipper ->disneydude   Gipper ->poweRob      Gipper ->Josie       
##  [7] Gipper ->Kandahar     Gipper ->votewho2012  Gipper ->ThePlayDrive
## [10] Gipper ->sharon       Gipper ->Grim17       Gipper ->Travis007   
## [13] Gipper ->Carleen      Redress->Gipper       Redress->soccerboy22 
## [16] Redress->jasonxe      Redress->washunut     Redress->Catawba     
## [19] Redress->Councilman   Redress->cpwill       Redress->cAPSLOCK    
## [22] Redress->Ikari        Redress->pbrauer      Redress->nota bene   
## + ... omitted several edges

1.2. Individual level indicators

dc = degree(net)
head(sort(dc,decreasing = T))
##        AdamT      Redress   TurtleDude      poweRob       cpwill Conservative 
##          994          756          739          701          659          646
dgr = data.frame(ideology=V(net)$leanR,out_degree=degree(net,mode="out"),in_degree=degree(net,mode="in"))
head(dgr)
##             ideology out_degree in_degree
## Gipper         right         13        11
## Redress         left        331       425
## soccerboy22    right        114        90
## jasonxe        right        139       160
## washunut        left        127       135
## Catawba         left        229       189

OK, let’s do some basic calculations:

cor.test(dgr$out_degree,dgr$in_degree,method='spearman')
## 
##  Spearman's rank correlation rho
## 
## data:  dgr$out_degree and dgr$in_degree
## S = 26341425, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9033159
dgr %>% dplyr::group_by(ideology) %>%
  dplyr::summarise(mean_d = mean(out_degree),
                   median_d = median(out_degree),
                   median_ind=median(in_degree))
## # A tibble: 3 x 4
##   ideology mean_d median_d median_ind
##   <chr>     <dbl>    <dbl>      <dbl>
## 1 central    25.4      5            5
## 2 left       48.3     14           16
## 3 right      47.5     15.5         15
summary(closeness(net))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 7.212e-07 4.409e-06 4.418e-06 4.278e-06 4.422e-06 4.436e-06
summary(betweenness(net))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.47    59.30  1667.29  1315.57 43034.95

1.3. Network level indicators

vcount(net) # 1178 nodes
## [1] 1178
ecount(net) # 43815 unique edges
## [1] 43815
plot(degree_distribution(net)) # degree distribution

mean(degree(net)) # mean degree
## [1] 74.38879
median(degree(net))
## [1] 19
graph.density(net) #0.03160102
## [1] 0.03160102
reciprocity(net) #0.6180075
## [1] 0.6180075
transitivity(net)
## [1] 0.3611749
triad.census(net)
##  [1] 240709063  15447217  11664348    232894    317637    381020    823100
##  [8]    778741     48772      6575    777241     49994     60153     71180
## [15]    241857    145384

1.4. Community structure

cpn=clusters(net)
#cpn
cmt=fastgreedy.community(as.undirected(net))
modularity(cmt)
## [1] 0.1758044
#cmt$membership
table(cmt$membership,V(net)$leanR)
##     
##      central left right
##   1      235  111   157
##   2      206   87   132
##   3      108   60    67
##   4        1    1     0
##   5        1    0     0
##   6        0    1     0
##   7        0    0     1
##   8        0    0     1
##   9        1    0     0
##   10       1    0     0
##   11       1    0     0
##   12       0    1     0
##   13       1    0     0
##   14       1    0     0
##   15       1    0     0
##   16       1    0     0
##   17       1    0     0

1.5. Coreness/Centralization

cores = coreness(net,mode="all")
head(sort(cores,decreasing = T))
##  Redress washunut  Catawba   cpwill    Ikari  pbrauer 
##      122      122      122      122      122      122
centr_degree(net)$centralization
## [1] 0.3909909
centr_betw(net)$centralization
## [1] 0.0708887

Any relationship between coreness and centralities?

tempd = data.frame(cores,degree(net),closeness(net),betweenness(net))
cor(tempd)
##                      cores degree.net. closeness.net. betweenness.net.
## cores            1.0000000   0.8324234     0.15837727       0.66506457
## degree.net.      0.8324234   1.0000000     0.11392931       0.93819420
## closeness.net.   0.1583773   0.1139293     1.00000000       0.08675634
## betweenness.net. 0.6650646   0.9381942     0.08675634       1.00000000

1.6. Data viz

Please refer to http://kateto.net/network-visualization

2. Network Formation Models

R igraph is good for descriptive statistics, whereas we need to switch to SNA for more advanced statistical models.

library(sna,quietly = T)

2.1. QAP

The input for QAP should be in matrix format:

dv = as.matrix(get.adjacency(net))

Create a match(similarity) matrix based on the node attributes (ideology):

head(nodes)
##         sender leanR
## 1       Gipper right
## 2      Redress  left
## 3  soccerboy22 right
## 8      jasonxe right
## 9     washunut  left
## 12     Catawba  left
mm = outer(nodes$leanR, nodes$leanR, function(a, b) as.integer(a == b))
diag(mm) = 0
dimnames(mm) = list(nodes$sender,nodes$sender)
mm[1:5,1:5]
##             Gipper Redress soccerboy22 jasonxe washunut
## Gipper           0       0           1       1        0
## Redress          0       0           0       0        1
## soccerboy22      1       0           0       1        0
## jasonxe          1       0           1       0        0
## washunut         0       1           0       0        0

Now, we can do the qaptest:

gcor(dv,mm)
## [1] -0.02610566

Test the correlation using qaptest: qaptest(A.List.Of.Networks, Some.Network.Function, reps=100), where reps is the number of graphs to be generated.

Out of a 100 permutations, how many had >= or <= correlation than the observed value? (the parameters g1=1 g2=2 tell the test to compare the first & second element of the list of networks that is the first argument of qaptest)

reply.qap <- qaptest(list(dv,mm), gcor, g1=1, g2=2, reps=100)
reply.qap
## 
## QAP Test Results
## 
## Estimated p-values:
##  p(f(perm) >= f(d)): 1 
##  p(f(perm) <= f(d)): 0
plot(reply.qap)

If we have multiple IVs (very slow):

IVs = rgraph(nrow(dv), 3) 
IVs[1,,] = mm

net.mod = netlm(dv, IVs, reps=100)
summary(net.mod)
## 
## OLS Network Model
## 
## Residuals:
##          0%         25%         50%         75%        100% 
## -0.03525915 -0.03518878 -0.03488432 -0.02570863  0.97459582 
## 
## Coefficients:
##             Estimate      Pr(<=b) Pr(>=b) Pr(>=|b|)
## (intercept)  3.518878e-02 1.00    0.00    0.00     
## x1          -9.480145e-03 0.00    1.00    0.00     
## x2           7.036824e-05 0.56    0.44    0.86     
## x3          -3.044567e-04 0.23    0.77    0.39     
## 
## Residual standard error: 0.1749 on 1386502 degrees of freedom
## Multiple R-squared: 0.0006823    Adjusted R-squared: 0.0006801 
## F-statistic: 315.6 on 3 and 1386502 degrees of freedom, p-value:     0 
## 
## 
## Test Diagnostics:
## 
##  Null Hypothesis: qap 
##  Replications: 100 
##  Coefficient Distribution Summary:
## 
##        (intercept)       x1       x2       x3
## Min       34.31939 -9.76649 -2.46050 -2.54961
## 1stQ      37.60962 -2.78665 -0.66790 -0.94907
## Median    38.96029  0.10826 -0.01245 -0.36752
## Mean      38.83054  0.45964 -0.02175 -0.21726
## 3rdQ      40.00869  3.65496  0.63604  0.51660
## Max       43.57685 13.25892  2.18323  2.89408

2.2(a). ERGM

Outline; 1) ERGM creation 2) MCMC diagnostics 3) ERGM model simulation 4) Model comparison 5) ERGM performance improvements

# load the "ergm" library
library(ergm)
library(sna)

# Specify the network we'll call net - where dyads 
# are the unit of analysis...
net.ergm = network(links[,1:2])

list.vertex.attributes(net.ergm)
## [1] "na"           "vertex.names"
v.names = get.vertex.attribute(net.ergm,'vertex.names')

Reorder the attribute value and assign to network object:

x = match(v.names,nodes$sender)
ideology = nodes$leanR[x]
# Assign node-level attributes to actors in "net"
net.ergm %v% "ideology" <- ideology

Estimate the models:

# ERGM
summary(net.ergm~edges+mutual+triangle) #43815 edges+ 2548501 triads
##    edges   mutual triangle 
##    43815    13539  2548501
m = ergm(net.ergm ~ edges + mutual + gwidegree + gwesp + nodefactor("ideology") + nodematch("ideology"),estimate = "MPLE",verbose=FALSE)
summary(m)
## Call:
## ergm(formula = net.ergm ~ edges + mutual + gwidegree + gwesp + 
##     nodefactor("ideology") + nodematch("ideology"), estimate = "MPLE", 
##     verbose = FALSE)
## 
## Maximum Pseudolikelihood Results:
## 
##                             Estimate Std. Error MCMC % z value Pr(>|z|)    
## edges                     -3.182e+00  1.248e-02      0 -255.02   <1e-04 ***
## mutual                     3.189e+00  1.266e-02      0  251.91   <1e-04 ***
## gwidegree                 -8.870e-01  1.956e-02      0  -45.35   <1e-04 ***
## gwidegree.decay            4.035e-01  2.713e-02      0   14.87   <1e-04 ***
## gwesp                     -6.304e-01  1.248e-02      0  -50.49   <1e-04 ***
## gwesp.decay                1.529e-13  1.966e-02      0    0.00        1    
## nodefactor.ideology.left  -1.590e-01  1.046e-02      0  -15.20   <1e-04 ***
## nodefactor.ideology.right -1.319e-01  9.466e-03      0  -13.94   <1e-04 ***
## nodematch.ideology        -4.743e-01  1.211e-02      0  -39.16   <1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Warning:  The standard errors are based on naive pseudolikelihood and are suspect.
## 
##      Null Pseudo-deviance: 1879898  on 1356060  degrees of freedom
##  Residual Pseudo-deviance:  239549  on 1356051  degrees of freedom
##  
## AIC: 239567  BIC: 239676  (Smaller is better. MC Std. Err. = 0)

The ERGM runs by an MCMC process with multiple starts, and this helps you see if the model is converging. If the estimated coefficient values were to change dramatically, it might be a sign of a poorly specified model). You should see the log-likelihood increase with each iteration. You will see a loop warning. You can ignore this for now. Before trying to interpret the data, it is a good idea to check the MCMC process. Also see link.

# for MCMC algorithm only
# mcmc.diagnostics(m)

You will see several plots and output. The plots to the right should look approximately normal. The output tells us three things of interest: 1) The accuracy of the model (r) 2) If we used a sufficiently large burn-in 3) If we used a sufficiently large sample in the simulation

In our case the samples might be too small. This doesn’t mean the results of the ERGM results are wrong, but we should take care in specifying the sample size. Let’s look at the summary of the results. We could create a new object that is the summary score info, but here we’ll just send it to the screen.

Let’s assess the model

summary(m)
## Call:
## ergm(formula = net.ergm ~ edges + mutual + gwidegree + gwesp + 
##     nodefactor("ideology") + nodematch("ideology"), estimate = "MPLE", 
##     verbose = FALSE)
## 
## Maximum Pseudolikelihood Results:
## 
##                             Estimate Std. Error MCMC % z value Pr(>|z|)    
## edges                     -3.182e+00  1.248e-02      0 -255.02   <1e-04 ***
## mutual                     3.189e+00  1.266e-02      0  251.91   <1e-04 ***
## gwidegree                 -8.870e-01  1.956e-02      0  -45.35   <1e-04 ***
## gwidegree.decay            4.035e-01  2.713e-02      0   14.87   <1e-04 ***
## gwesp                     -6.304e-01  1.248e-02      0  -50.49   <1e-04 ***
## gwesp.decay                1.529e-13  1.966e-02      0    0.00        1    
## nodefactor.ideology.left  -1.590e-01  1.046e-02      0  -15.20   <1e-04 ***
## nodefactor.ideology.right -1.319e-01  9.466e-03      0  -13.94   <1e-04 ***
## nodematch.ideology        -4.743e-01  1.211e-02      0  -39.16   <1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Warning:  The standard errors are based on naive pseudolikelihood and are suspect.
## 
##      Null Pseudo-deviance: 1879898  on 1356060  degrees of freedom
##  Residual Pseudo-deviance:  239549  on 1356051  degrees of freedom
##  
## AIC: 239567  BIC: 239676  (Smaller is better. MC Std. Err. = 0)
lapply(m[1],exp)
## $coefficients
##                     edges                    mutual                 gwidegree 
##                 0.0414875                24.2736368                 0.4118708 
##           gwidegree.decay                     gwesp               gwesp.decay 
##                 1.4970812                 0.5323851                 1.0000000 
##  nodefactor.ideology.left nodefactor.ideology.right        nodematch.ideology 
##                 0.8530085                 0.8763927                 0.6223244

The first section gives the model (formula) estimated in the ERGM. Here we said that the network was a function of edges, mutual ties and matching with respect ideology. Another way to think about this, is that we’re generating random networks that match the observed network with respect to the number of edges, the number of mutual dyads, the number of ties within/between ideology.

The second section tells how many iterations were done. The MCMC sample size is the number of random networks generated in the production of the estimate.

The next section gives the coefficients, their SEs and pValues. These are on a log-odds scale, so we interpret them like logit coefficients.

An edges value of -4.90694 means that the odds of seeing a tie on any dyad are exp(-4.90694) = 0.007395083, which could be thought of as density net of the other factors in the model. If you only have ‘edges’ in the model, then exp(b1) should be very close to the observed density. Edges are equivalent to a model intercept – while possible, I can’t imagine why one would estimate a model without an edges parameter.

A mutual value of 4.08642 means that reciprocity is more common than expected by chance (positive and significant), and here we see that exp(4.08642)=59.526674874, so it’s much more likely than chance – we are 60 times as likely to see a tie from ij if ji than if j did not nominate i.

We are exp(-0.22823)=0.795942220 times likely to nominate within ideology than across ideology.

The final section refers to overall model fit and MCMC diagnostic statistics (AIC, BIC).

fit = gof(m ~ odegree + idegree + espartners + distance)
## Warning in gof.formula(object = object$formula, coef = coef, GOF = GOF, : No
## parameter values given, using 0.
#fit
plot(fit)

2.2(b). Valued ERGM

See link

2.2(c). TERGM

2.3. LSM: Latent Space Model

library(eigenmodel) #symmetric relational data
mtx = as.matrix(get.adjacency(as.undirected(net),sparse=FALSE,type='both'))
lsm.fit1 <- eigenmodel_mcmc(Y=mtx, R=2,S=150,burn=100)
## 5 percent done (iteration 13) Mon Apr 04 20:35:12 2022
## 10 percent done (iteration 25) Mon Apr 04 20:35:22 2022
## 15 percent done (iteration 38) Mon Apr 04 20:35:31 2022
## 20 percent done (iteration 50) Mon Apr 04 20:35:39 2022
## 25 percent done (iteration 63) Mon Apr 04 20:35:48 2022
## 30 percent done (iteration 75) Mon Apr 04 20:35:57 2022
## 35 percent done (iteration 88) Mon Apr 04 20:36:06 2022
## 40 percent done (iteration 100) Mon Apr 04 20:36:15 2022
## 45 percent done (iteration 113) Mon Apr 04 20:36:25 2022
## 50 percent done (iteration 125) Mon Apr 04 20:36:35 2022
## 55 percent done (iteration 138) Mon Apr 04 20:36:46 2022
## 60 percent done (iteration 150) Mon Apr 04 20:36:55 2022
## 65 percent done (iteration 163) Mon Apr 04 20:37:05 2022
## 70 percent done (iteration 175) Mon Apr 04 20:37:16 2022
## 75 percent done (iteration 188) Mon Apr 04 20:37:27 2022
## 80 percent done (iteration 200) Mon Apr 04 20:37:37 2022
## 85 percent done (iteration 213) Mon Apr 04 20:37:48 2022
## 90 percent done (iteration 225) Mon Apr 04 20:37:57 2022
## 95 percent done (iteration 238) Mon Apr 04 20:38:08 2022
## 100 percent done (iteration 250) Mon Apr 04 20:38:17 2022
vec = eigen(lsm.fit1$ULU_postmean)$vec[, 1:2]
plot(lsm.fit1)

IVs = array(mm,dim=c(1178,1178,1))
lsm.fit2 <- eigenmodel_mcmc(Y=mtx, X=IVs, R=2,S=150,burn=20)
## 5 percent done (iteration 9) Mon Apr 04 20:38:35 2022
## 10 percent done (iteration 17) Mon Apr 04 20:38:41 2022
## 15 percent done (iteration 26) Mon Apr 04 20:38:48 2022
## 20 percent done (iteration 34) Mon Apr 04 20:38:54 2022
## 25 percent done (iteration 43) Mon Apr 04 20:39:02 2022
## 30 percent done (iteration 51) Mon Apr 04 20:39:10 2022
## 35 percent done (iteration 60) Mon Apr 04 20:39:17 2022
## 40 percent done (iteration 68) Mon Apr 04 20:39:25 2022
## 45 percent done (iteration 77) Mon Apr 04 20:39:33 2022
## 50 percent done (iteration 85) Mon Apr 04 20:39:39 2022
## 55 percent done (iteration 94) Mon Apr 04 20:39:47 2022
## 60 percent done (iteration 102) Mon Apr 04 20:39:54 2022
## 65 percent done (iteration 111) Mon Apr 04 20:40:02 2022
## 70 percent done (iteration 119) Mon Apr 04 20:40:09 2022
## 75 percent done (iteration 128) Mon Apr 04 20:40:17 2022
## 80 percent done (iteration 136) Mon Apr 04 20:40:23 2022
## 85 percent done (iteration 145) Mon Apr 04 20:40:31 2022
## 90 percent done (iteration 153) Mon Apr 04 20:40:37 2022
## 95 percent done (iteration 162) Mon Apr 04 20:40:45 2022
## 100 percent done (iteration 170) Mon Apr 04 20:40:52 2022
plot(lsm.fit2)