We will use the SNA data, which were collected form a US discussion forum. The rows indicate replies between users. All users have self-reported political ideology. The data were stored in edgelist format.
load("SNA.Rdata")
nodes = SNA[,c("sender","leanR")]
nodes = nodes[!duplicated(nodes$sender),] #1178 users
table(nodes$leanR)
##
## central left right
## 559 261 358
library(dplyr)
links = SNA %>% dplyr::select("sender","receiver") %>%
dplyr::filter(sender!=receiver) %>%
dplyr::group_by(sender,receiver) %>% dplyr::summarise(weight=length(sender))
head(links)
## # A tibble: 6 x 3
## # Groups: sender [1]
## sender receiver weight
## <chr> <chr> <int>
## 1 ????? ???? 99percenter 2
## 2 ????? ???? AdamT 14
## 3 ????? ???? AliHajiSheik 1
## 4 ????? ???? Amigo 1
## 5 ????? ???? apdst 3
## 6 ????? ???? areafiftyone 1
We introduce igraph to import edgelist
library(igraph)
net = graph.data.frame(links, nodes, directed=T)
net
## IGRAPH f2b2709 DNW- 1178 43815 --
## + attr: name (v/c), leanR (v/c), weight (e/n)
## + edges from f2b2709 (vertex names):
## [1] ????? ????->99percenter ????? ????->AdamT
## [3] ????? ????->AliHajiSheik ????? ????->Amigo
## [5] ????? ????->apdst ????? ????->areafiftyone
## [7] ????? ????->Aunt Spiker ????? ????->Awesome!
## [9] ????? ????->beerftw ????? ????->Bigfoot 88
## [11] ????? ????->Billy the Kid ????? ????->Binky
## [13] ????? ????->Blue_State ????? ????->Bobcat
## [15] ????? ????->Boo Radley ????? ????->Born Free
## + ... omitted several edges
E(net) # The edges of the "net" object
## + 43815/43815 edges from f2b2709 (vertex names):
## [1] ????? ????->99percenter ????? ????->AdamT
## [3] ????? ????->AliHajiSheik ????? ????->Amigo
## [5] ????? ????->apdst ????? ????->areafiftyone
## [7] ????? ????->Aunt Spiker ????? ????->Awesome!
## [9] ????? ????->beerftw ????? ????->Bigfoot 88
## [11] ????? ????->Billy the Kid ????? ????->Binky
## [13] ????? ????->Blue_State ????? ????->Bobcat
## [15] ????? ????->Boo Radley ????? ????->Born Free
## [17] ????? ????->Bronson ????? ????->Caine
## [19] ????? ????->Camlon ????? ????->Captain America
## + ... omitted several edges
V(net) # The vertices of the "net" object
## + 1178/1178 vertices, named, from f2b2709:
## [1] Gipper Redress soccerboy22
## [4] jasonxe washunut Catawba
## [7] TOJ Cecil James _Markum_
## [10] Councilman cpwill cAPSLOCK
## [13] Ikari Gina pbrauer
## [16] nota bene Boo Radley randel
## [19] Hugh_Akston upsideguy dixiesolutions
## [22] American Top Cat lpast
## [25] Risky Thicket Crosscheck Temporal
## [28] ChuckBerry Fiddytree Voltaire X
## + ... omitted several vertices
hist(E(net)$weight) # Edge weight
table(V(net)$leanR) # Vertex attribute ideology
##
## central left right
## 559 261 358
We can also import matrix
mx = as.matrix(get.adjacency(net))
net2 = graph.adjacency(mx)
net2
## IGRAPH f348749 DN-- 1178 43815 --
## + attr: name (v/c)
## + edges from f348749 (vertex names):
## [1] Gipper ->Redress Gipper ->cpwill Gipper ->Ikari
## [4] Gipper ->disneydude Gipper ->poweRob Gipper ->Josie
## [7] Gipper ->Kandahar Gipper ->votewho2012 Gipper ->ThePlayDrive
## [10] Gipper ->sharon Gipper ->Grim17 Gipper ->Travis007
## [13] Gipper ->Carleen Redress->Gipper Redress->soccerboy22
## [16] Redress->jasonxe Redress->washunut Redress->Catawba
## [19] Redress->Councilman Redress->cpwill Redress->cAPSLOCK
## [22] Redress->Ikari Redress->pbrauer Redress->nota bene
## + ... omitted several edges
dc = degree(net)
head(sort(dc,decreasing = T))
## AdamT Redress TurtleDude poweRob cpwill Conservative
## 994 756 739 701 659 646
dgr = data.frame(ideology=V(net)$leanR,out_degree=degree(net,mode="out"),in_degree=degree(net,mode="in"))
head(dgr)
## ideology out_degree in_degree
## Gipper right 13 11
## Redress left 331 425
## soccerboy22 right 114 90
## jasonxe right 139 160
## washunut left 127 135
## Catawba left 229 189
OK, let’s do some basic calculations:
cor.test(dgr$out_degree,dgr$in_degree,method='spearman')
##
## Spearman's rank correlation rho
##
## data: dgr$out_degree and dgr$in_degree
## S = 26341425, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9033159
dgr %>% dplyr::group_by(ideology) %>%
dplyr::summarise(mean_d = mean(out_degree),
median_d = median(out_degree),
median_ind=median(in_degree))
## # A tibble: 3 x 4
## ideology mean_d median_d median_ind
## <chr> <dbl> <dbl> <dbl>
## 1 central 25.4 5 5
## 2 left 48.3 14 16
## 3 right 47.5 15.5 15
summary(closeness(net))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.212e-07 4.409e-06 4.418e-06 4.278e-06 4.422e-06 4.436e-06
summary(betweenness(net))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.47 59.30 1667.29 1315.57 43034.95
vcount(net) # 1178 nodes
## [1] 1178
ecount(net) # 43815 unique edges
## [1] 43815
plot(degree_distribution(net)) # degree distribution
mean(degree(net)) # mean degree
## [1] 74.38879
median(degree(net))
## [1] 19
graph.density(net) #0.03160102
## [1] 0.03160102
reciprocity(net) #0.6180075
## [1] 0.6180075
transitivity(net)
## [1] 0.3611749
triad.census(net)
## [1] 240709063 15447217 11664348 232894 317637 381020 823100
## [8] 778741 48772 6575 777241 49994 60153 71180
## [15] 241857 145384
cpn=clusters(net)
#cpn
cmt=fastgreedy.community(as.undirected(net))
modularity(cmt)
## [1] 0.1758044
#cmt$membership
table(cmt$membership,V(net)$leanR)
##
## central left right
## 1 235 111 157
## 2 206 87 132
## 3 108 60 67
## 4 1 1 0
## 5 1 0 0
## 6 0 1 0
## 7 0 0 1
## 8 0 0 1
## 9 1 0 0
## 10 1 0 0
## 11 1 0 0
## 12 0 1 0
## 13 1 0 0
## 14 1 0 0
## 15 1 0 0
## 16 1 0 0
## 17 1 0 0
cores = coreness(net,mode="all")
head(sort(cores,decreasing = T))
## Redress washunut Catawba cpwill Ikari pbrauer
## 122 122 122 122 122 122
centr_degree(net)$centralization
## [1] 0.3909909
centr_betw(net)$centralization
## [1] 0.0708887
Any relationship between coreness and centralities?
tempd = data.frame(cores,degree(net),closeness(net),betweenness(net))
cor(tempd)
## cores degree.net. closeness.net. betweenness.net.
## cores 1.0000000 0.8324234 0.15837727 0.66506457
## degree.net. 0.8324234 1.0000000 0.11392931 0.93819420
## closeness.net. 0.1583773 0.1139293 1.00000000 0.08675634
## betweenness.net. 0.6650646 0.9381942 0.08675634 1.00000000
Please refer to http://kateto.net/network-visualization
R igraph is good for descriptive statistics, whereas we need to switch to SNA for more advanced statistical models.
library(sna,quietly = T)
The input for QAP should be in matrix format:
dv = as.matrix(get.adjacency(net))
Create a match(similarity) matrix based on the node attributes (ideology):
head(nodes)
## sender leanR
## 1 Gipper right
## 2 Redress left
## 3 soccerboy22 right
## 8 jasonxe right
## 9 washunut left
## 12 Catawba left
mm = outer(nodes$leanR, nodes$leanR, function(a, b) as.integer(a == b))
diag(mm) = 0
dimnames(mm) = list(nodes$sender,nodes$sender)
mm[1:5,1:5]
## Gipper Redress soccerboy22 jasonxe washunut
## Gipper 0 0 1 1 0
## Redress 0 0 0 0 1
## soccerboy22 1 0 0 1 0
## jasonxe 1 0 1 0 0
## washunut 0 1 0 0 0
Now, we can do the qaptest:
gcor(dv,mm)
## [1] -0.02610566
Test the correlation using qaptest: qaptest(A.List.Of.Networks, Some.Network.Function, reps=100), where reps is the number of graphs to be generated.
Out of a 100 permutations, how many had >= or <= correlation than the observed value? (the parameters g1=1 g2=2 tell the test to compare the first & second element of the list of networks that is the first argument of qaptest)
reply.qap <- qaptest(list(dv,mm), gcor, g1=1, g2=2, reps=100)
reply.qap
##
## QAP Test Results
##
## Estimated p-values:
## p(f(perm) >= f(d)): 1
## p(f(perm) <= f(d)): 0
plot(reply.qap)
If we have multiple IVs (very slow):
IVs = rgraph(nrow(dv), 3)
IVs[1,,] = mm
net.mod = netlm(dv, IVs, reps=100)
summary(net.mod)
##
## OLS Network Model
##
## Residuals:
## 0% 25% 50% 75% 100%
## -0.03525915 -0.03518878 -0.03488432 -0.02570863 0.97459582
##
## Coefficients:
## Estimate Pr(<=b) Pr(>=b) Pr(>=|b|)
## (intercept) 3.518878e-02 1.00 0.00 0.00
## x1 -9.480145e-03 0.00 1.00 0.00
## x2 7.036824e-05 0.56 0.44 0.86
## x3 -3.044567e-04 0.23 0.77 0.39
##
## Residual standard error: 0.1749 on 1386502 degrees of freedom
## Multiple R-squared: 0.0006823 Adjusted R-squared: 0.0006801
## F-statistic: 315.6 on 3 and 1386502 degrees of freedom, p-value: 0
##
##
## Test Diagnostics:
##
## Null Hypothesis: qap
## Replications: 100
## Coefficient Distribution Summary:
##
## (intercept) x1 x2 x3
## Min 34.31939 -9.76649 -2.46050 -2.54961
## 1stQ 37.60962 -2.78665 -0.66790 -0.94907
## Median 38.96029 0.10826 -0.01245 -0.36752
## Mean 38.83054 0.45964 -0.02175 -0.21726
## 3rdQ 40.00869 3.65496 0.63604 0.51660
## Max 43.57685 13.25892 2.18323 2.89408
Outline; 1) ERGM creation 2) MCMC diagnostics 3) ERGM model simulation 4) Model comparison 5) ERGM performance improvements
# load the "ergm" library
library(ergm)
library(sna)
# Specify the network we'll call net - where dyads
# are the unit of analysis...
net.ergm = network(links[,1:2])
list.vertex.attributes(net.ergm)
## [1] "na" "vertex.names"
v.names = get.vertex.attribute(net.ergm,'vertex.names')
Reorder the attribute value and assign to network object:
x = match(v.names,nodes$sender)
ideology = nodes$leanR[x]
# Assign node-level attributes to actors in "net"
net.ergm %v% "ideology" <- ideology
Estimate the models:
# ERGM
summary(net.ergm~edges+mutual+triangle) #43815 edges+ 2548501 triads
## edges mutual triangle
## 43815 13539 2548501
m = ergm(net.ergm ~ edges + mutual + gwidegree + gwesp + nodefactor("ideology") + nodematch("ideology"),estimate = "MPLE",verbose=FALSE)
summary(m)
## Call:
## ergm(formula = net.ergm ~ edges + mutual + gwidegree + gwesp +
## nodefactor("ideology") + nodematch("ideology"), estimate = "MPLE",
## verbose = FALSE)
##
## Maximum Pseudolikelihood Results:
##
## Estimate Std. Error MCMC % z value Pr(>|z|)
## edges -3.182e+00 1.248e-02 0 -255.02 <1e-04 ***
## mutual 3.189e+00 1.266e-02 0 251.91 <1e-04 ***
## gwidegree -8.870e-01 1.956e-02 0 -45.35 <1e-04 ***
## gwidegree.decay 4.035e-01 2.713e-02 0 14.87 <1e-04 ***
## gwesp -6.304e-01 1.248e-02 0 -50.49 <1e-04 ***
## gwesp.decay 1.529e-13 1.966e-02 0 0.00 1
## nodefactor.ideology.left -1.590e-01 1.046e-02 0 -15.20 <1e-04 ***
## nodefactor.ideology.right -1.319e-01 9.466e-03 0 -13.94 <1e-04 ***
## nodematch.ideology -4.743e-01 1.211e-02 0 -39.16 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Warning: The standard errors are based on naive pseudolikelihood and are suspect.
##
## Null Pseudo-deviance: 1879898 on 1356060 degrees of freedom
## Residual Pseudo-deviance: 239549 on 1356051 degrees of freedom
##
## AIC: 239567 BIC: 239676 (Smaller is better. MC Std. Err. = 0)
The ERGM runs by an MCMC process with multiple starts, and this helps you see if the model is converging. If the estimated coefficient values were to change dramatically, it might be a sign of a poorly specified model). You should see the log-likelihood increase with each iteration. You will see a loop warning. You can ignore this for now. Before trying to interpret the data, it is a good idea to check the MCMC process. Also see link.
# for MCMC algorithm only
# mcmc.diagnostics(m)
You will see several plots and output. The plots to the right should look approximately normal. The output tells us three things of interest: 1) The accuracy of the model (r) 2) If we used a sufficiently large burn-in 3) If we used a sufficiently large sample in the simulation
In our case the samples might be too small. This doesn’t mean the results of the ERGM results are wrong, but we should take care in specifying the sample size. Let’s look at the summary of the results. We could create a new object that is the summary score info, but here we’ll just send it to the screen.
Let’s assess the model
summary(m)
## Call:
## ergm(formula = net.ergm ~ edges + mutual + gwidegree + gwesp +
## nodefactor("ideology") + nodematch("ideology"), estimate = "MPLE",
## verbose = FALSE)
##
## Maximum Pseudolikelihood Results:
##
## Estimate Std. Error MCMC % z value Pr(>|z|)
## edges -3.182e+00 1.248e-02 0 -255.02 <1e-04 ***
## mutual 3.189e+00 1.266e-02 0 251.91 <1e-04 ***
## gwidegree -8.870e-01 1.956e-02 0 -45.35 <1e-04 ***
## gwidegree.decay 4.035e-01 2.713e-02 0 14.87 <1e-04 ***
## gwesp -6.304e-01 1.248e-02 0 -50.49 <1e-04 ***
## gwesp.decay 1.529e-13 1.966e-02 0 0.00 1
## nodefactor.ideology.left -1.590e-01 1.046e-02 0 -15.20 <1e-04 ***
## nodefactor.ideology.right -1.319e-01 9.466e-03 0 -13.94 <1e-04 ***
## nodematch.ideology -4.743e-01 1.211e-02 0 -39.16 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Warning: The standard errors are based on naive pseudolikelihood and are suspect.
##
## Null Pseudo-deviance: 1879898 on 1356060 degrees of freedom
## Residual Pseudo-deviance: 239549 on 1356051 degrees of freedom
##
## AIC: 239567 BIC: 239676 (Smaller is better. MC Std. Err. = 0)
lapply(m[1],exp)
## $coefficients
## edges mutual gwidegree
## 0.0414875 24.2736368 0.4118708
## gwidegree.decay gwesp gwesp.decay
## 1.4970812 0.5323851 1.0000000
## nodefactor.ideology.left nodefactor.ideology.right nodematch.ideology
## 0.8530085 0.8763927 0.6223244
The first section gives the model (formula) estimated in the ERGM. Here we said that the network was a function of edges, mutual ties and matching with respect ideology. Another way to think about this, is that we’re generating random networks that match the observed network with respect to the number of edges, the number of mutual dyads, the number of ties within/between ideology.
The second section tells how many iterations were done. The MCMC sample size is the number of random networks generated in the production of the estimate.
The next section gives the coefficients, their SEs and pValues. These are on a log-odds scale, so we interpret them like logit coefficients.
An edges value of -4.90694 means that the odds of seeing a tie on any dyad are exp(-4.90694) = 0.007395083, which could be thought of as density net of the other factors in the model. If you only have ‘edges’ in the model, then exp(b1) should be very close to the observed density. Edges are equivalent to a model intercept – while possible, I can’t imagine why one would estimate a model without an edges parameter.
A mutual value of 4.08642 means that reciprocity is more common than expected by chance (positive and significant), and here we see that exp(4.08642)=59.526674874, so it’s much more likely than chance – we are 60 times as likely to see a tie from ij if ji than if j did not nominate i.
We are exp(-0.22823)=0.795942220 times likely to nominate within ideology than across ideology.
The final section refers to overall model fit and MCMC diagnostic statistics (AIC, BIC).
fit = gof(m ~ odegree + idegree + espartners + distance)
## Warning in gof.formula(object = object$formula, coef = coef, GOF = GOF, : No
## parameter values given, using 0.
#fit
plot(fit)
See link
library(eigenmodel) #symmetric relational data
mtx = as.matrix(get.adjacency(as.undirected(net),sparse=FALSE,type='both'))
lsm.fit1 <- eigenmodel_mcmc(Y=mtx, R=2,S=150,burn=100)
## 5 percent done (iteration 13) Mon Apr 04 20:35:12 2022
## 10 percent done (iteration 25) Mon Apr 04 20:35:22 2022
## 15 percent done (iteration 38) Mon Apr 04 20:35:31 2022
## 20 percent done (iteration 50) Mon Apr 04 20:35:39 2022
## 25 percent done (iteration 63) Mon Apr 04 20:35:48 2022
## 30 percent done (iteration 75) Mon Apr 04 20:35:57 2022
## 35 percent done (iteration 88) Mon Apr 04 20:36:06 2022
## 40 percent done (iteration 100) Mon Apr 04 20:36:15 2022
## 45 percent done (iteration 113) Mon Apr 04 20:36:25 2022
## 50 percent done (iteration 125) Mon Apr 04 20:36:35 2022
## 55 percent done (iteration 138) Mon Apr 04 20:36:46 2022
## 60 percent done (iteration 150) Mon Apr 04 20:36:55 2022
## 65 percent done (iteration 163) Mon Apr 04 20:37:05 2022
## 70 percent done (iteration 175) Mon Apr 04 20:37:16 2022
## 75 percent done (iteration 188) Mon Apr 04 20:37:27 2022
## 80 percent done (iteration 200) Mon Apr 04 20:37:37 2022
## 85 percent done (iteration 213) Mon Apr 04 20:37:48 2022
## 90 percent done (iteration 225) Mon Apr 04 20:37:57 2022
## 95 percent done (iteration 238) Mon Apr 04 20:38:08 2022
## 100 percent done (iteration 250) Mon Apr 04 20:38:17 2022
vec = eigen(lsm.fit1$ULU_postmean)$vec[, 1:2]
plot(lsm.fit1)
IVs = array(mm,dim=c(1178,1178,1))
lsm.fit2 <- eigenmodel_mcmc(Y=mtx, X=IVs, R=2,S=150,burn=20)
## 5 percent done (iteration 9) Mon Apr 04 20:38:35 2022
## 10 percent done (iteration 17) Mon Apr 04 20:38:41 2022
## 15 percent done (iteration 26) Mon Apr 04 20:38:48 2022
## 20 percent done (iteration 34) Mon Apr 04 20:38:54 2022
## 25 percent done (iteration 43) Mon Apr 04 20:39:02 2022
## 30 percent done (iteration 51) Mon Apr 04 20:39:10 2022
## 35 percent done (iteration 60) Mon Apr 04 20:39:17 2022
## 40 percent done (iteration 68) Mon Apr 04 20:39:25 2022
## 45 percent done (iteration 77) Mon Apr 04 20:39:33 2022
## 50 percent done (iteration 85) Mon Apr 04 20:39:39 2022
## 55 percent done (iteration 94) Mon Apr 04 20:39:47 2022
## 60 percent done (iteration 102) Mon Apr 04 20:39:54 2022
## 65 percent done (iteration 111) Mon Apr 04 20:40:02 2022
## 70 percent done (iteration 119) Mon Apr 04 20:40:09 2022
## 75 percent done (iteration 128) Mon Apr 04 20:40:17 2022
## 80 percent done (iteration 136) Mon Apr 04 20:40:23 2022
## 85 percent done (iteration 145) Mon Apr 04 20:40:31 2022
## 90 percent done (iteration 153) Mon Apr 04 20:40:37 2022
## 95 percent done (iteration 162) Mon Apr 04 20:40:45 2022
## 100 percent done (iteration 170) Mon Apr 04 20:40:52 2022
plot(lsm.fit2)