Last active
January 3, 2016 05:29
-
-
Save computermacgyver/8416496 to your computer and use it in GitHub Desktop.
Analysis of Twitter mentions/retweets network for #CHI2014 paper "Global Connectivity and Multilinguals in the Twitter Network". #R #igraph
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/R | |
| # Main analysis for "Global Connectivity and Multilinguals in the Twitter Network" | |
| # paper. See http://www.scotthale.net/pubs/?chi2014 for details. | |
| # | |
| # Author: Scott A. Hale (http://www.scotthale.net/) | |
| # License: GPLv2 | |
| # If you use this in support of an academic publication, please cite: | |
| # | |
| # Hale, S. A. (2014) Global Connectivity and Multilinguals in the Twitter Network. | |
| # In Proceedings of the 2014 ACM Annual Conference on Human Factors in Computing Systems, | |
| # ACM (Montreal, Canada). | |
| # | |
| # More details, related code, and the original academic paper using this code | |
| # is available at http://www.scotthale.net/pubs/?chi2014 | |
| # | |
| # Requires density_functions.R available at https://gist.github.com/computermacgyver/8416453 | |
| library(igraph) | |
| library(ggplot2) | |
| library(scales) | |
| library(gridExtra) | |
| library(plyr) | |
| library(parallel) | |
| library(foreach) | |
| library(doParallel) | |
| source("density_functions.R") #This file available at https://gist.github.com/computermacgyver/8416453 | |
| #if non-interactive and --save flag use the following to save the enviornment before quitting on an error | |
| #options(error = quote(q("yes"))) | |
| NUM.CORES<-5 | |
| NUM.RUNS<-100 | |
| if (NUM.RUNS<NUM.CORES) { | |
| NUM.CORES<-NUM.RUNS#Only use one core if we are running only once | |
| } | |
| #TODO: Consider average time between tweets -- remove lowest quartile as bots (?) | |
| #TODO: Combine Indonesian and Malay together! | |
| g<-read.graph("../../hadoop-twitter/graph_ge4_ge2-20_clean-msin_anon.graphml",format="graphml") | |
| components<-decompose.graph(g,mode="weak", min.vertices = 100) | |
| gLCC<-components[[1]] | |
| rm(g) | |
| pngWidth<-800 | |
| pngHeight<-700 | |
| svgWidth<-8 | |
| svgHeight<-8 | |
| summary(V(gLCC)$tweetCount) | |
| # summary(V(gLCC)$tweetCount) | |
| # Min. 1st Qu. Median Mean 3rd Qu. Max. | |
| # 1.000 1.000 2.000 3.615 4.000 343.000 | |
| #Filter to require at least one *in* edge | |
| V(gLCC)$inDegree<-degree(gLCC, mode = "in") | |
| summary(V(gLCC)$inDegree) | |
| gMin5<-induced.subgraph(gLCC,which(V(gLCC)$inDegree>0 & V(gLCC)$tweetCountAdj>0 & V(gLCC)$tweetCount>=4)) | |
| components<-decompose.graph(gMin5,mode="weak", min.vertices = 100) | |
| gMin5LCC<-components[[1]] | |
| #rm(gLCC) | |
| rm(gMin5) | |
| #clusters<-edge.betweenness.community(gMin5LCC) | |
| #sizes(clusters) | |
| summary(gMin5LCC) | |
| gMin5LCC<-simplify(gMin5LCC,remove.loops=TRUE,remove.multiple=TRUE,edge.attr.comb="sum") | |
| summary(gMin5LCC) | |
| gMin5LCCUndirected<-as.undirected(gMin5LCC, mode = "collapse") | |
| summary(gMin5LCCUndirected) | |
| summary(V(gMin5LCCUndirected)$majLangPercentAdj) | |
| length(V(gMin5LCCUndirected)$majLangPercentAdj) | |
| length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj==1]) | |
| length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj!=1]) | |
| length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj!=1])/length(V(gMin5LCCUndirected)$majLangPercentAdj) | |
| ############################## | |
| ## Summary ## | |
| ############################### | |
| write.graph(gMin5LCC,"edges.list",format="edgelist") | |
| write(V(gMin5LCC)$majLangAdj,"langclean.txt") | |
| write.csv(V(gMin5LCC)$label,"usernames.txt",row.names=FALSE)#Untested | |
| V(gMin5LCC)$inDegree<-degree(gMin5LCC, mode = "in") | |
| V(gMin5LCC)$outDegree<-degree(gMin5LCC, mode = "out") | |
| V(gMin5LCC)$constraint<-constraint(gMin5LCC) | |
| #V(gMin5LCC)$betweenness<-betweenness(gMin5LCC) | |
| V(gMin5LCC)$multilingual[V(gMin5LCC)$majLangPercentAdj!=1]<-"Multilingual" | |
| V(gMin5LCC)$multilingual[V(gMin5LCC)$majLangPercentAdj==1]<-"Monolingual" | |
| #V(gMin5LCCUndirected)$multilingual<-as.factor(V(gMin5LCCUndirected)$multilingual) | |
| table(V(gMin5LCC)$multilingual) | |
| length(V(gMin5LCC)$multilingual) | |
| sumStats<-data.frame(multilingual=V(gMin5LCC)$multilingual, | |
| tweetCount=V(gMin5LCC)$tweetCountAdj, | |
| inDegree=V(gMin5LCC)$inDegree, | |
| outDegree=V(gMin5LCC)$outDegree, | |
| majLangCount=V(gMin5LCC)$majLangCountAdj, | |
| constraint=V(gMin5LCC)$constraint#,betweenness=V(gMin5LCC)$betweenness | |
| ) | |
| #TODO: Remove outliers? | |
| svg("tweetCount.svg",width=4,height=4) | |
| plot <- ggplot(sumStats,aes(x=multilingual,y=tweetCount)) | |
| plot <- plot + geom_boxplot() + scale_y_log10("Number of tweets") | |
| plot <- plot + scale_x_discrete("") + theme_bw() | |
| plot | |
| dev.off() | |
| svg("inDegree.svg",width=4,height=4) | |
| plot <- ggplot(sumStats,aes(x=multilingual,y=inDegree)) | |
| plot <- plot + geom_boxplot() + scale_y_log10("In-degree") | |
| plot <- plot + scale_x_discrete("") + theme_bw() | |
| plot | |
| dev.off() | |
| svg("outDegree.svg",width=4,height=4) | |
| plot <- ggplot(sumStats,aes(x=multilingual,y=outDegree)) | |
| plot <- plot + geom_boxplot() + scale_y_log10("Out-degree") | |
| plot <- plot + scale_x_discrete("") + theme_bw() | |
| plot | |
| dev.off() | |
| svg("majLangCount.svg",width=4,height=4) | |
| plot <- ggplot(sumStats,aes(x=multilingual,y=majLangCount)) | |
| plot <- plot + geom_boxplot() + scale_y_log10("Number of tweets in majority language") | |
| plot <- plot + scale_x_discrete("") + theme_bw() | |
| plot | |
| dev.off() | |
| ggplot(sumStats,aes(x=tweetCount,color=multilingual)) + geom_density() + scale_x_log10() | |
| #TODO: Different linetypes for split variable??? (e.g., dashed for mono, solid for multi) | |
| pTweetCount<-density.log(sumStats,"tweetCount","multilingual",n=2048,adjust=8,title="Number of tweets") | |
| pIndegree<-density.log(sumStats,"inDegree","multilingual",n=2048,adjust=8,title="In-degree/visibility") | |
| pOutdegree<-density.log(sumStats,"outDegree","multilingual",n=2048,adjust=8,title="Out-degree/reach") | |
| pConstraint<-density.log(sumStats,"constraint","multilingual",n=2048,adjust=8,title="Burt's constraint") | |
| #pBetweenness<-density.log(sumStats,"betweenness","multilingual",n=2048,adjust=8,title="Betweenness") | |
| tmp <- ggplot_gtable(ggplot_build(pTweetCount$plot)) | |
| leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") | |
| leg <- tmp$grobs[[leg]] | |
| lheight <- sum(leg$height) | |
| svg("introSummary2.svg",width=12,height=5) | |
| grid.arrange( | |
| pTweetCount$plot+theme(legend.position="none")+scale_x_log10("Number of tweets", | |
| breaks = c(1,10,100), | |
| labels = trans_format("log10", math_format(10^.x))), | |
| pOutdegree$plot+theme(legend.position="none")+scale_x_log10("Out-degree/reach", | |
| breaks = c(1,10,100), | |
| labels = trans_format("log10", math_format(10^.x))), | |
| pIndegree$plot+theme(legend.position="none")+scale_x_log10("In-degree/visibility", | |
| breaks = c(1,10,100,1000,10000), | |
| labels = trans_format("log10", math_format(10^.x))), | |
| #Row 2 | |
| textGrob(""), leg, ncol=3,ncol=1,heights=unit.c(unit(1, "npc") - lheight, lheight) | |
| ) | |
| dev.off() | |
| svg("introSummary.svg",width=12,height=5) | |
| grid.arrange( | |
| pTweetCount$plot+theme(legend.position="none"), | |
| pOutdegree$plot+theme(legend.position="none"), | |
| pIndegree$plot+theme(legend.position="none"), | |
| #Row 2 | |
| textGrob(""), leg, ncol=3,ncol=1,heights=unit.c(unit(1, "npc") - lheight, lheight) | |
| ) | |
| dev.off() | |
| #plot <- ggplot(subset(sumStats,tweetCount>=4),aes(x=tweetCount,y=inDegree,group=multilingual,color=multilingual)) | |
| #plot <- plot + geom_point() + scale_y_log10("Number of tweets in majority language") | |
| #plot <- plot + scale_x_log10("") + theme_bw() | |
| #plot <- ggplot(subset(sumStats,multilingual=="Multilingual"),aes(x=majLangPercent)) + geom_hist() | |
| summary(sumStats$tweetCount[sumStats$multilingual=="Monolingual"]) | |
| sd(sumStats$tweetCount[sumStats$multilingual=="Monolingual"]) | |
| summary(sumStats$tweetCount[sumStats$multilingual=="Multilingual"]) | |
| sd(sumStats$tweetCount[sumStats$multilingual=="Multilingual"]) | |
| t.test(sumStats$tweetCount[sumStats$multilingual=="Monolingual"],sumStats$tweetCount[sumStats$multilingual=="Multilingual"]) | |
| wilcox.test(sumStats$tweetCount~sumStats$multilingual) | |
| kruskal.test(sumStats$tweetCount~sumStats$multilingual) | |
| summary(sumStats$outDegree[sumStats$multilingual=="Monolingual"]) | |
| sd(sumStats$outDegree[sumStats$multilingual=="Monolingual"]) | |
| summary(sumStats$outDegree[sumStats$multilingual=="Multilingual"]) | |
| sd(sumStats$outDegree[sumStats$multilingual=="Multilingual"]) | |
| t.test(sumStats$outDegree[sumStats$multilingual=="Monolingual"],sumStats$outDegree[sumStats$multilingual=="Multilingual"]) | |
| wilcox.test(sumStats$outDegree~sumStats$multilingual) | |
| kruskal.test(sumStats$outDegree~sumStats$multilingual) | |
| summary(sumStats$inDegree[sumStats$multilingual=="Monolingual"]) | |
| sd(sumStats$inDegree[sumStats$multilingual=="Monolingual"]) | |
| summary(sumStats$inDegree[sumStats$multilingual=="Multilingual"]) | |
| sd(sumStats$inDegree[sumStats$multilingual=="Multilingual"]) | |
| t.test(sumStats$inDegree[sumStats$multilingual=="Monolingual"],sumStats$inDegree[sumStats$multilingual=="Multilingual"]) | |
| wilcox.test(sumStats$inDegree~sumStats$multilingual) | |
| kruskal.test(sumStats$inDegree~sumStats$multilingual) | |
| summary(sumStats$constraint[sumStats$multilingual=="Monolingual"]) | |
| summary(sumStats$constraint[sumStats$multilingual=="Multilingual"]) | |
| #summary(sumStats$betweenness[sumStats$multilingual=="Monolingual"]) | |
| #summary(sumStats$betweenness[sumStats$multilingual=="Multilingual"]) | |
| ######################## | |
| ## Node removal -- This analysis is not included in the published paper. See Node removal simple. | |
| ######################## | |
| #see games.c | |
| #gRand<-rewire(gMin5LCCUndirected,mode="simple",niter=1000) | |
| #OR | |
| #Copy majLangPercentRank across based on degreeRank | |
| #V(gMin5LCCUndirected)$degreeRank<-rank(V(gMin5LCCUndirected)$degree,ties.method="random") | |
| #V(gRand)$degree<-degree(gRand) | |
| #V(gRand)$degreeRank<-rank(V(gRand)$degree,ties.method="random") | |
| #ranks<-V(gMin5LCCUndirected)$majLangPercentRank[order(V(gMin5LCCUndirected)$degreeRank)] | |
| #V(gRand)$majLangPercentRank<-ranks[order(V(gRand)$degreeRank)] | |
| V(gMin5LCCUndirected)$multilingual[V(gMin5LCCUndirected)$majLangPercentAdj!=1]<-"Multilingual" | |
| V(gMin5LCCUndirected)$multilingual[V(gMin5LCCUndirected)$majLangPercentAdj==1]<-"Monolingual" | |
| #TODO: Consider comparing to attack algorithm -- e.g. remove most central nodes first (by k-cores?) | |
| # Betweenness centrality does not work as it changes as nodes are removed | |
| # Could consdier comparing to removing highest degree first -- Did this (I think): multilinguals created less components than highest degree first | |
| runNodeRemoval <- function(vals) { | |
| # as many rows & cols as needed; don't know levels yet | |
| GRAPHSIZE<-length(V(gMin5LCCUndirected)) | |
| #vals<-seq(0.01,0.99,0.03)#c(.01,.02,.03,.04,.05,.06,.07,.08,.09,.1,.2,.3,.4,.5,.6,.7,.8,.9) | |
| #vals<-seq(0.01,0.1,0.005) | |
| N <- length(vals)*3 | |
| df <- data.frame(val=rep(NA, N), components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), constraint=rep(NA, N), numEdges=rep(NA,N), numNodes=rep(NA,N), method=rep("", N), stringsAsFactors=FALSE) | |
| i<-0 | |
| #Multilinguals first | |
| for (x in vals) { | |
| print(x) | |
| y<-x*GRAPHSIZE | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank>=y) | |
| #print(summary(tmp)) | |
| #clusters<-fastgreedy.community(tmp) | |
| components<-clusters(tmp,mode="weak") | |
| #print(components$no) | |
| m<-max(components$csize) | |
| i<-i+1 | |
| df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multiFirst") | |
| } | |
| #Monolinguals first | |
| for (x in vals) { | |
| print(x) | |
| y<-(1-x)*GRAPHSIZE | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank<=y) | |
| #print(summary(tmp)) | |
| #clusters<-fastgreedy.community(tmp) | |
| components<-clusters(tmp,mode="weak") | |
| #print(components$no) | |
| m<-max(components$csize) | |
| i<-i+1 | |
| df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "monoFirst") | |
| } | |
| #Randomly permute majLangPercent and repeat the above | |
| V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank) | |
| for (x in vals) { | |
| print(x) | |
| y<-x*GRAPHSIZE | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$permuted>=y) | |
| #print(summary(tmp)) | |
| components<-clusters(tmp,mode="weak") | |
| #print(components$no) | |
| m<-max(components$csize) | |
| i<-i+1 | |
| df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random") | |
| } | |
| #Compare to gRand | |
| for (x in vals) { | |
| print(x) | |
| y<-x*GRAPHSIZE | |
| tmp<-induced.subgraph(gRand,V(gRand)$majLangPercentRank>=y) | |
| #print(summary(tmp)) | |
| #clusters<-fastgreedy.community(tmp) | |
| components<-clusters(tmp,mode="weak") | |
| #print(components$no) | |
| m<-max(components$csize) | |
| i<-i+1 | |
| df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random-rewire") | |
| } | |
| #High degree first | |
| #V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank) | |
| #for (x in vals) { | |
| # print(x) | |
| # y<-x*GRAPHSIZE | |
| # tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$degreeRank>=y) | |
| # print(summary(tmp)) | |
| # components<-clusters(tmp,mode="weak") | |
| # print(components$no) | |
| # m<-max(components$csize) | |
| # i<-i+1 | |
| # df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), "highFirst") | |
| #} | |
| return(df) | |
| } | |
| if (NUM.CORES>1) { | |
| cl <- makeCluster(NUM.CORES) | |
| registerDoParallel(cl, cores = NUM.CORES) | |
| } | |
| dfAvg <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"), | |
| .combine = rbind) %dopar% { | |
| print(paste0("Run: ",run)) | |
| GRAPHSIZE<-length(V(gMin5LCCUndirected)) | |
| V(gMin5LCCUndirected)$degree<-degree(gMin5LCCUndirected, mode = "all") | |
| V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random") | |
| gRand<-degree.sequence.game(degree(gMin5LCCUndirected),method="vl") | |
| V(gRand)$majLangPercentRank<-V(gMin5LCCUndirected)$majLangPercentRank | |
| df<-runNodeRemoval(seq(0.01,0.99,0.01)) | |
| df$edgesRemoved<-ecount(gMin5LCCUndirected)-as.numeric(df$numEdges) | |
| write.csv(df,paste0("output_df_randomOrder_withCounts_",run,".csv")) | |
| return(df) | |
| } | |
| warnings() | |
| if (NUM.CORES>1) { | |
| stopCluster(cl) | |
| } | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$multilingual=="Monolingual") | |
| 1-ecount(tmp)/ecount(gMin5LCCUndirected) | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentAdj>=.6) | |
| 1-ecount(tmp)/ecount(gMin5LCCUndirected) | |
| sum(V(gMin5LCCUndirected)$multilingual=="Multilingual")/vcount(gMin5LCCUndirected) | |
| sum(V(gMin5LCCUndirected)$majLangPercentAdj<.6)/vcount(gMin5LCCUndirected) | |
| #dfZoom<-runNodeRemoval(seq(0.01,0.2,0.01)) | |
| #write.csv(dfZoom,"output_dfZoom.csv") | |
| #df<-read.csv("output2.csv") | |
| df<-dfAvg | |
| df$val<-as.numeric(df$val) | |
| df$components<-as.numeric(df$components) | |
| df$lccSize<-as.numeric(df$lccSize) | |
| df$avgSize<-as.numeric(df$avgSize) | |
| df$constraint<-as.numeric(df$constraint) | |
| df$numEdges<-as.numeric(df$numEdges) | |
| df$numNodes<-as.numeric(df$numNodes) | |
| df$edgesRemoved<-as.numeric(df$edgesRemoved) | |
| df<-ddply(df,.(val,method),summarize, | |
| components=mean(components),lccSize=mean(lccSize), | |
| avgSize=mean(avgSize),constraint=mean(constraint), | |
| numEdges=mean(numEdges),numNodes=mean(numNodes),edgesRemoved=mean(edgesRemoved)) | |
| write.csv(df,"output_df_randomOrder_withCounts_avg100.csv") | |
| png("no_components.png",width=pngWidth,height=pngHeight) | |
| plot<-ggplot(df,aes(x=val,y=components,color=method,group=method)) | |
| plot<-plot + geom_line() + scale_color_brewer(type="qual") + scale_x_continuous(labels=percent) | |
| plot | |
| dev.off() | |
| png("lccSize.png",width=pngWidth,height=pngHeight) | |
| plot<-ggplot(df,aes(x=val,y=lccSize,color=method,group=method)) | |
| plot<-plot + geom_line() + scale_color_brewer(type="qual") + scale_x_continuous(labels=percent) | |
| plot | |
| dev.off() | |
| png("avgSize.png",width=pngWidth,height=pngHeight) | |
| plot<-ggplot(df,aes(x=val,y=avgSize,color=method,group=method)) | |
| plot<-plot + geom_line() + scale_color_brewer(type="qual") + scale_x_continuous(labels=percent) | |
| plot | |
| dev.off() | |
| png("constraint.png",width=pngWidth,height=pngHeight) | |
| plot<-ggplot(df,aes(x=val,y=constraint,color=method,group=method)) | |
| plot<-plot + geom_line() | |
| plot | |
| dev.off() | |
| ######################## | |
| ## Node removal simple -- This is included | |
| ######################## | |
| runNodeRemovalSimple <- function(gMin5LCCUndirected,gRand) { | |
| nodesToRemove<-length(V(gMin5LCCUndirected)[V(gMin5LCCUndirected)$majLangPercentAdj<1]) | |
| GRAPHSIZE<-length(V(gMin5LCCUndirected)) | |
| N <- 4 | |
| df <- data.frame(components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), constraint=rep(NA, N), numEdges=rep(NA,N), numNodes=rep(NA,N), method=rep("", N), stringsAsFactors=FALSE) | |
| i<-0 | |
| print("Multilinguals (all removed)") | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentAdj==1) | |
| components<-clusters(tmp,mode="weak") | |
| m<-max(components$csize) | |
| i<-i+1 | |
| df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multi") | |
| print("Monolinguals (random subset)") | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank<=(GRAPHSIZE-nodesToRemove)) | |
| components<-clusters(tmp,mode="weak") | |
| m<-max(components$csize) | |
| i<-i+1 | |
| df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "mono") | |
| #Randomly permute majLangPercent and repeat the above | |
| V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank) | |
| print("Random subset") | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$permuted>nodesToRemove) | |
| components<-clusters(tmp,mode="weak") | |
| m<-max(components$csize) | |
| i<-i+1 | |
| df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random") | |
| print("Multilinguals (all, edge rewiring)") | |
| tmp<-induced.subgraph(gRand,V(gRand)$majLangPercentAdj==1) | |
| components<-clusters(tmp,mode="weak") | |
| m<-max(components$csize) | |
| i<-i+1 | |
| df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multiRewire") | |
| print("Highest degree subset") | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$degreeRank>nodesToRemove) | |
| components<-clusters(tmp,mode="weak") | |
| m<-max(components$csize) | |
| i<-i+1 | |
| df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "highDegree") | |
| return(df) | |
| } | |
| if (NUM.CORES>1) { | |
| cl <- makeCluster(NUM.CORES) | |
| registerDoParallel(cl, cores = NUM.CORES) | |
| } | |
| dfNodeRemoval <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"), | |
| .combine = rbind) %dopar% { | |
| print(paste0("Run: ",run)) | |
| GRAPHSIZE<-length(V(gMin5LCCUndirected)) | |
| V(gMin5LCCUndirected)$degree<-degree(gMin5LCCUndirected, mode = "all") | |
| V(gMin5LCCUndirected)$degreeRank<-GRAPHSIZE-rank(V(gMin5LCCUndirected)$degree,ties.method="random") | |
| V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random") | |
| gRand<-degree.sequence.game(degree(gMin5LCCUndirected),method="vl") | |
| V(gRand)$majLangPercentRank<-V(gMin5LCCUndirected)$majLangPercentRank | |
| V(gRand)$majLangPercentAdj<-V(gMin5LCCUndirected)$majLangPercentAdj | |
| df<-runNodeRemovalSimple(gMin5LCCUndirected,gRand) | |
| df$edgesRemoved<-ecount(gMin5LCCUndirected)-as.numeric(df$numEdges) | |
| df$run<-run | |
| write.csv(df,paste0("output_dfNodeRemovalSimple_",run,".csv")) | |
| return(df) | |
| } | |
| write.csv(dfNodeRemoval,"dfNodeRemovalSimple.csv") | |
| warnings() | |
| if (NUM.CORES>1) { | |
| stopCluster(cl) | |
| } | |
| conf95<-function(df,var) { | |
| m<-mean(df[,var]) | |
| s<-sd(df[,var]) | |
| tmp<-data.frame( | |
| m, | |
| s, | |
| m+(1.96*s), | |
| m-(1.96*s), | |
| max(df[,var]), | |
| min(df[,var]) | |
| ) | |
| eval(names(tmp)<-c( | |
| paste0(var,"Mean"),paste0(var,"SD"), | |
| paste0(var,"95Max"),paste0(var,"95Min"), | |
| paste0(var,"Max"),paste0(var,"Min") | |
| )) | |
| return(tmp) | |
| } | |
| meanSD<-function(df) { | |
| tmp<-data.frame( | |
| lccSize=mean(df$lccSize),lccSizeSD=sd(df$lccSize), | |
| avgSize=mean(df$avgSize),avgSizeSD=sd(df$avgSize), | |
| components=mean(df$components),componentsSD=sd(df$components) | |
| ) | |
| lccSize<-conf95(df,"lccSize") | |
| avgSize<-conf95(df,"avgSize") | |
| components<-conf95(df,"components") | |
| #print(tmp) | |
| return(cbind(lccSize,avgSize,components)) | |
| } | |
| #dfNodeRemoval<-read.csv("dfNodeRemovalSimple.csv") | |
| dfNodeRemovalAvg<-ddply(dfNodeRemoval,.(method),meanSD) | |
| dfNodeRemovalAvg | |
| dfNodeRemovalAvg<-subset(dfNodeRemovalAvg,method!="highDegree") | |
| #bar chart | |
| ggplot(dfNodeRemovalAvg,aes(x=method,y=componentsMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=components95Max,ymin=components95Min)) + theme_bw() | |
| ggplot(dfNodeRemovalAvg,aes(x=method,y=lccSizeMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=lccSize95Max,ymin=lccSize95Min)) + theme_bw() | |
| ggplot(dfNodeRemovalAvg,aes(x=method,y=avgSizeMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=avgSize95Max,ymin=avgSize95Min)) + theme_bw() | |
| #boxplots | |
| dfNodeRemoval<-subset(dfNodeRemoval,method!="highDegree") | |
| dfNodeRemoval$label<-factor(dfNodeRemoval$method,levels=c("multi","mono","random","multiRewire"),labels=c("Multilinguals","\nMonolinguals","Random","\nMultilinguals\n(edges rewired)")) | |
| dfNodeRemovalAvg$label<-factor(dfNodeRemovalAvg$method,levels=c("multi","mono","random","multiRewire"),labels=c("Multilinguals","\nMonolinguals","Random","\nMultilinguals\n(edges rewired)")) | |
| #xscale<-scale_x_discrete("", | |
| # breaks=c("multi","mono","multiRewire","random"), | |
| # labels=c("Multilinguals","\nMonolinguals","\nMultilinguals\n(Edges rewired)","Random") | |
| #) | |
| xscale<-scale_x_discrete("") | |
| cPlot<-ggplot(dfNodeRemoval,aes(x=label,y=components))+geom_boxplot()+theme_bw() | |
| cPlot<-cPlot+xscale+scale_y_continuous("Number of components",labels=comma) | |
| cPlot<-cPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=componentsMean),shape=3)#3=+, 4=* | |
| lccPlot<-ggplot(dfNodeRemoval,aes(x=label,y=lccSize))+geom_boxplot()+theme_bw() | |
| lccPlot<-lccPlot+xscale+scale_y_continuous("Size of LCC",labels=comma) | |
| lccPlot<-lccPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=lccSizeMean),shape=3) | |
| avgPlot<-ggplot(dfNodeRemoval,aes(x=label,y=avgSize))+geom_boxplot()+theme_bw() | |
| avgPlot<-avgPlot+xscale+scale_y_continuous("Average component size (not including LCC)",labels=comma) | |
| avgPlot<-avgPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=avgSizeMean),shape=3) | |
| svg("nodeRemovalSimple.svg",width=12,height=5) | |
| grid.arrange(lccPlot,cPlot,avgPlot,nrow=1,ncol=3) | |
| dev.off() | |
| #Addl -- compare % left in LCC to having removed all speakers of a given language (dfRemoveLangs) | |
| nodesToRemove<-length(V(gMin5LCCUndirected)[V(gMin5LCCUndirected)$majLangPercentAdj<1]) | |
| dfNodeRemovalAvg$lccPercent<-1-(dfNodeRemovalAvg$lccSizeMean/(length(V(gMin5LCC))-nodesToRemove)) | |
| dfNodeRemovalAvg[,c("method","lccPercent")] | |
| dfRemoveLangs[dfRemoveLangs=="en","lccFrac"] | |
| ############################### | |
| ## Language analysis - remove nodes from one language at a time | |
| ############################## | |
| #\label{tbl:topLangs} | |
| #TOP Langs: Table tbl:topLangs | |
| sort(table(V(gMin5LCCUndirected)$majLangAdj)) | |
| langs<-c("en","ja","pt","ms","es","nl","ko","th") | |
| #Avg. tweets per lang | |
| dfTmp<-data.frame(majLangAdj=V(gMin5LCCUndirected)$majLangAdj,tweetCount=V(gMin5LCCUndirected)$tweetCount) | |
| dfTmp<-ddply(dfTmp,.(majLangAdj),summarize,meanTweets=round(mean(tweetCount),2),tweetSD=round(sd(tweetCount),2),tweetMin=min(tweetCount),tweetMax=max(tweetCount),tweetMed=median(tweetCount),sumTweets=sum(tweetCount),numUsers=length(tweetCount)) | |
| dfTmp[order(dfTmp$numUsers,decreasing=TRUE),c("majLangAdj","numUsers","meanTweets","tweetSD","tweetMin","tweetMax","tweetMed")] | |
| print(dfTmp[order(dfTmp$numUsers,decreasing=TRUE),c("majLangAdj","numUsers","meanTweets","tweetSD")],row.names=FALSE) | |
| GRAPHSIZE<-length(V(gMin5LCCUndirected)) | |
| vals<-seq(0.01,0.3,0.01) | |
| N <- length(vals)*(length(langs)+1) | |
| if (NUM.CORES>1) { | |
| cl <- makeCluster(NUM.CORES) | |
| registerDoParallel(cl, cores = NUM.CORES) | |
| } | |
| dfLang100 <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"), | |
| .combine = rbind) %dopar% { | |
| dfLang <- data.frame(val=rep(NA, N), components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), lang=rep("", N),stringsAsFactors=FALSE) | |
| i<-0 | |
| print(paste0("Run: ",run)) | |
| V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random") | |
| V(gMin5LCCUndirected)$majLangPercentRankRev<-GRAPHSIZE-V(gMin5LCCUndirected)$majLangPercentRank | |
| for (lang in langs) { | |
| ranks<-sort(V(gMin5LCCUndirected)$majLangPercentRankRev[V(gMin5LCCUndirected)$majLangAdj==lang]) | |
| for (x in vals) { | |
| print(paste(lang,x)) | |
| y<-x*GRAPHSIZE | |
| y<-ranks[min(y,length(ranks))] | |
| tmp<-induced.subgraph(gMin5LCCUndirected, | |
| V(gMin5LCCUndirected)$majLangAdj!=lang | V(gMin5LCCUndirected)$majLangPercentRankRev>=y) | |
| #print(summary(tmp)) | |
| components<-clusters(tmp,mode="weak") | |
| #print(components$no) | |
| m<-max(components$csize) | |
| i<-i+1 | |
| dfLang[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), lang) | |
| } | |
| } | |
| #Randomly permute majLangPercentRev and repeat the above | |
| V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRankRev) | |
| for (x in vals) { | |
| print(paste("random",x)) | |
| y<-x*GRAPHSIZE | |
| tmp<-induced.subgraph(gMin5LCCUndirected, | |
| V(gMin5LCCUndirected)$permuted>=y) | |
| #print(summary(tmp)) | |
| components<-clusters(tmp,mode="weak") | |
| #print(components$no) | |
| m<-max(components$csize) | |
| i<-i+1 | |
| dfLang[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), "random") | |
| } | |
| write.csv(dfLang,paste0("dfLang_run_",run,".csv")) | |
| return(dfLang) | |
| } | |
| dfLang<-dfLang100 | |
| dfLang$val<-as.numeric(dfLang$val) | |
| dfLang$components<-as.numeric(dfLang$components) | |
| dfLang$lccSize<-as.numeric(dfLang$lccSize) | |
| dfLang$avgSize<-as.numeric(dfLang$avgSize) | |
| write.csv(dfLang,"dfLangAllRuns.csv") | |
| dfLang<-ddply(dfLang,.(val,lang),summarize, | |
| components=mean(components),lccSize=mean(lccSize), | |
| avgSize=mean(avgSize)) | |
| write.csv(dfLang,"dfLang_avg100.csv") | |
| warnings() | |
| if (NUM.CORES>1) { | |
| stopCluster(cl) | |
| } | |
| #scale_color_brewer(type="qual") | |
| dfLang<-subset(dfLang,lang %in% langs) | |
| dfLang$lang[dfLang$lang=="random"]<-"Random" | |
| svg("lang_no_components.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfLang,aes(x=val,y=components,color=lang,group=lang)) | |
| plot<-plot + geom_point() + geom_line(size=1) + scale_color_discrete("Language")#,type="qual") | |
| plot<-plot + scale_x_continuous('% of users removed',labels=percent) | |
| plot<-plot + scale_y_continuous("Number of components",labels=comma) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot<-plot + geom_text(aes(x=.31,y=components,color=lang,label=lang),data=subset(dfLang,val==0.3)) | |
| plot | |
| dev.off() | |
| dfEnding<-subset(dfLang,val==.3) | |
| dfEnding[order(dfEnding$components), ] | |
| svg("lang_lccSize.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfLang,aes(x=val,y=lccSize,color=lang,group=lang)) | |
| plot<-plot + geom_point() + geom_line(size=1) + scale_color_discrete("Language")#,type="qual") | |
| plot<-plot + scale_x_continuous('% of users removed',labels=percent) | |
| plot<-plot + scale_y_continuous("Size of largest connected component",labels=comma) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16)) | |
| plot | |
| dev.off() | |
| #TODO: Look at direct.label | |
| table(dfLang$lang[dfLang$val==.3],dfLang$components[dfLang$val==.3]) | |
| #Ending num. components for each language (val==.3) | |
| #en 48,190 | |
| #es 1,923 | |
| #in 6,864 | |
| #ja 385 | |
| #ko 184 | |
| #ms 8,626 | |
| #nl 965 | |
| #pt 2,211 | |
| #random 67,206 | |
| V(gMin5LCCUndirected)$constraint<-constraint(gMin5LCCUndirected) | |
| dfGraph<-data.frame( | |
| majLangPercentAdj=V(gMin5LCCUndirected)$majLangPercentAdj, | |
| tweetCount=V(gMin5LCCUndirected)$tweetCountAdj, | |
| inDegree=V(gMin5LCCUndirected)$inDegree, | |
| constraint=V(gMin5LCCUndirected)$constraint | |
| ) | |
| #More graphs | |
| png("tweetCount_majLangPercent.png",width=pngWidth,height=pngHeight) | |
| plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=tweetCount)) + geom_point() | |
| plot | |
| dev.off() | |
| png("inDegree_majLangPercent.png",width=pngWidth,height=pngHeight) | |
| plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=inDegree)) + geom_point() | |
| plot | |
| dev.off() | |
| png("constraint_majLangPercent.png",width=pngWidth,height=pngHeight) | |
| plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=constraint)) + geom_point() | |
| plot | |
| dev.off() | |
| V(gMin5LCCUndirected)$label[V(gMin5LCCUndirected)$inDegree>40000]#justinbieber | |
| V(gMin5LCCUndirected)$label[V(gMin5LCCUndirected)$inDegree>10000]#SlGNO | |
| #Remove nodes from lang and calculate fraction of reminaing nodes in LCC | |
| langs<-as.data.frame(table(V(gMin5LCCUndirected)$majLangAdj)) | |
| langs<-subset(langs,langs$Freq>1000)$Var1 | |
| N<-length(langs) | |
| dfRemoveLangs<-data.frame(lang=rep("",N),components=rep(NA,N),lccSize=rep(NA,N), | |
| numNodes=rep(NA,N),num2=rep(NA,N),stringsAsFactors=FALSE) | |
| i<-0 | |
| for (lang in langs) { | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangAdj!=lang) | |
| components<-clusters(tmp,mode="weak") | |
| m<-max(components$csize) | |
| i<-i+1 | |
| dfRemoveLangs[i, ] <- list(lang, components$no, m, length(V(tmp)), sum(components$csize[components$csize!=m])) | |
| } | |
| dfRemoveLangs$numNodes==dfRemoveLangs$num2+dfRemoveLangs$lccSize | |
| dfRemoveLangs$lccFrac<-(dfRemoveLangs$numNodes-dfRemoveLangs$lccSize)/dfRemoveLangs$numNodes | |
| svg("lang_removed.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfRemoveLangs,aes(x=lang,y=lccFrac)) | |
| plot<-plot + geom_bar(stat="identity") | |
| plot<-plot + scale_y_continuous('% of nodes not in largest-connected component',labels=percent) | |
| plot<-plot + scale_x_discrete("Language removed") | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangAdj!="ms" & V(gMin5LCCUndirected)$majLangAdj!="in") | |
| components<-clusters(tmp,mode="weak") | |
| m<-max(components$csize) | |
| (length(V(tmp))-m)/length(V(tmp)) | |
| ################################################### | |
| ## Q1: Is language homophilous? ## | |
| ################################################### | |
| #take largest *strongly* connected component? | |
| #labelPropComm<-label.propagation.community(gMin5LCCUndirected) | |
| #write.csv(labelPropComm$membership,"labelPropCommMembership.csv") | |
| commsFile<-"iter72memberships_smart.txt" #This file comes from the label propagation code in Java | |
| if (!is.na(commsFile)){ | |
| memberships<-read.csv(commsFile,header=FALSE,sep=" ") | |
| #Begin Untested code | |
| tmp<-read.csv("usernames.txt",header=FALSE) | |
| sum(tmp!=V(gMin5LCCUndirected)$label)#This should be 0, else the lists are malaligned! | |
| #End untested code | |
| modularity(gMin5LCCUndirected,memberships$V2) | |
| #[1] 0.7471879 | |
| #in-ms: 0.8054294 | |
| N <- max(memberships$V2) | |
| dfComms <- data.frame(num=seq(1, N), size=rep(0, N), numLangs=rep(0, N), majLangCount=rep(0,N), majLang=rep("", N),stringsAsFactors=FALSE) | |
| for (comm in seq(1,N)) { | |
| size<-length(memberships$V2[memberships$V2==comm]) | |
| langs<-V(gMin5LCCUndirected)$majLangAdj[memberships$V2==comm] | |
| numLangs<-length(table(langs)) | |
| majLang <- unique(langs) | |
| majLang <- majLang[which.max(tabulate(match(langs,majLang)))] | |
| majLangCount<-sum(langs==majLang) | |
| dfComms[comm, ] <- list(comm, size, numLangs, majLangCount, majLang) | |
| } | |
| write.csv(dfComms,"dfCommunities.csv") | |
| #dfComms<-read.csv("community_info_iter36.csv") | |
| #dfComms$label<-dfComms$label+1 | |
| sum(dfComms$numLangs==1) | |
| sum(dfComms$numLangs==1)/length(dfComms$numLangs) #A large number of components have one language only | |
| sum(dfComms$size[dfComms$numLangs==1])/sum(dfComms$size) #But these components represent a small number of users | |
| #Five large components have 61% of users, #\label{tbl:topCommunities} | |
| sum(dfComms$size[dfComms$size>10^4])/sum(dfComms$size) | |
| dfComms[dfComms$size>10^4, ] | |
| top<-dfComms[dfComms$size>10^4, ] | |
| top$majLangPercent<-round((top$majLangCount/top$size)*100,1) | |
| top[order(top$size,decreasing=TRUE), c("majLang","majLangPercent","numLangs","size")] | |
| head(dfComms[order(dfComms$size,decreasing=TRUE), c("majLang","majLangPercent","numLangs","size")],n=25) | |
| sort(table(dfComms$majLang)) | |
| #Is 48 the total number of langs in the network? | |
| length(unique(V(gMin5LCCUndirected)$majLangAdj)) | |
| dfComms$majLangPercent<-dfComms$majLangCount/dfComms$size | |
| summary(dfComms) | |
| with(dfComms,plot(log(size),majLangPercent)) | |
| max(dfComms$size) | |
| dfComms$label[dfComms$size==max(dfComms$size)] | |
| #label 16 is quite large | |
| svg("communities_size_percent.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,color=majLang)) | |
| plot<-plot+geom_point() | |
| plot<- plot + scale_x_log10("Cluster size", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| svg("communities_size_percent_labels.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,label=majLang)) | |
| plot<-plot+geom_text() | |
| plot<- plot + scale_x_log10("Cluster size", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| svg("communities_size_percent_labels_partial.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,color=majLang)) | |
| plot<-plot+geom_point() | |
| plot<-plot+geom_text(data=subset(dfComms,size>10^4),aes(x=size,y=majLangPercent+0.025,label=majLang)) | |
| plot<- plot + scale_x_log10("Cluster size", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| dfComms[dfComms$size>10^4, ] | |
| #hist(log(dfComms$size)) | |
| svg("community_size.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfComms,aes(x=size)) | |
| plot<-plot+geom_histogram(binwidth=0.15) | |
| plot<- plot + scale_x_log10("Cluster size", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot<-plot + scale_y_continuous("Frequency",labels=comma) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| svg("community_numLangs_hist.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfComms,aes(x=numLangs)) | |
| plot<-plot+geom_histogram(binwidth=1) | |
| plot<- plot + scale_x_continuous("Number of languages per cluster") | |
| #plot<-plot + scale_y_continuous("Percent of users in communities with this number of languages or fewer (cdf)",labels=percent) | |
| plot<-plot + scale_y_continuous("Frequency",labels=comma) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| svg("community_numLangs_hist_zoom.svg",width=svgWidth,height=svgHeight) | |
| zplot<-ggplot(subset(dfComms,numLangs<=5),aes(x=as.factor(numLangs))) | |
| zplot<-zplot+geom_histogram(binwidth=1) | |
| zplot<-zplot + scale_x_discrete("Number of languages per cluster") | |
| zplot<-zplot + scale_y_continuous("Frequency",labels=comma) | |
| zplot<-zplot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| zplot | |
| dev.off() | |
| #dfDensity<-densityDataframe(dfComms,"numLangs",NA,n=2048,adjust=8) | |
| #dplot <- ggplot(dfDensity,aes(x=x10,y=y)) + geom_path() | |
| dfTmp<-as.data.frame(table(dfComms$numLangs)) | |
| insert<-ggplot(dfTmp,aes(x=as.numeric(Var1),y=Freq))+geom_point()+ | |
| scale_y_log10("", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x)))+ | |
| scale_x_continuous("")+ theme_bw() | |
| svg("community_numLangs_hist_insert.svg",width=svgWidth,height=svgHeight) | |
| print(zplot) | |
| #print(plot+scale_x_continuous("")+scale_y_continuous("",labels=comma), | |
| print(insert, | |
| vp=viewport(width=0.6,height=0.6,x=1,y=1,just=c("right","top"))) | |
| dev.off() | |
| N <- max(memberships$V2) | |
| if (NUM.CORES>1) { | |
| cl <- makeCluster(NUM.CORES) | |
| registerDoParallel(cl, cores = NUM.CORES) | |
| } | |
| dfCommsShuffled100 <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"), | |
| .combine = rbind) %dopar% { | |
| dfCommsShuffled <- data.frame(num=seq(1, N), size=rep(0, N), numLangs=rep(0, N), majLangCount=rep(0,N), majLang=rep("", N),stringsAsFactors=FALSE) | |
| print(paste0("Run: ",run)) | |
| V(gMin5LCCUndirected)$majLangAdjShuffled<-sample(V(gMin5LCCUndirected)$majLangAdj)#Random permutation | |
| for (comm in seq(1,N)) { | |
| size<-length(memberships$V2[memberships$V2==comm]) | |
| langs<-V(gMin5LCCUndirected)$majLangAdjShuffled[memberships$V2==comm] | |
| numLangs<-length(table(langs)) | |
| majLang <- unique(langs) | |
| majLang <- majLang[which.max(tabulate(match(langs,majLang)))] | |
| majLangCount<-sum(langs==majLang) | |
| dfCommsShuffled[comm, ] <- list(comm, size, numLangs, majLangCount, majLang) | |
| } | |
| write.csv(dfCommsShuffled,paste0("dfCommsShuffled_run_",run,".csv")) | |
| return(dfCommsShuffled) | |
| } | |
| #Collapse dfComms | |
| write.csv(dfCommsShuffled100,"dfCommsShuffled.csv") | |
| dfCommsShuffled<-ddply(dfCommsShuffled100,.(num),summarize, | |
| size=mean(size),sizeSD=sd(size), | |
| numLangs=mean(numLangs),numLangsSD=sd(numLangs), | |
| majLangCount=mean(majLangCount),majLangCountSD=sd(majLangCount), | |
| majLang="skip") | |
| dfCommsShuffled$majLangPercent<-dfCommsShuffled$majLangCount/dfCommsShuffled$size | |
| warnings() | |
| if (NUM.CORES>1) { | |
| stopCluster(cl) | |
| } | |
| svg("community_numLangs_hist_shuffled.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfCommsShuffled,aes(x=numLangs)) | |
| plot<-plot+geom_histogram(binwidth=1) | |
| plot<- plot + scale_x_continuous("Number of languages per community (when language labels are randomly shuffled)") | |
| plot<-plot + scale_y_continuous("Frequency",labels=comma) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| summary(dfComms$numLangs) | |
| summary(dfCommsShuffled$numLangs) | |
| length(dfComms$numLangs[dfComms$numLangs==1]) | |
| length(dfCommsShuffled$numLangs[dfCommsShuffled$numLangs==1]) | |
| dfComms$shuffled<-"Observed data" | |
| dfCommsShuffled$shuffled<-"Language labels shuffled" | |
| dfTmp<-rbind(dfComms[,c("size","numLangs","majLangCount","shuffled")],dfCommsShuffled[,c("size","numLangs","majLangCount","shuffled")]) | |
| dfDensity<-densityDataframe(dfTmp,"numLangs","shuffled",n=2048,adjust=6) | |
| dfDensitySum<-densitySummary(dfTmp,"numLangs","shuffled") | |
| pNumLangs<-densityPlot(dfDensity,dfDensitySum,"Number of languages per community") | |
| svg("community_numLangs_density.svg",width=svgWidth,height=svgHeight) | |
| pNumLangs | |
| dev.off() | |
| #difference of means t-test | |
| t.test(x=dfComms$numLangs,y=dfCommsShuffled$numLangs,alternative="two.sided") | |
| # p-value < 2.2e-16 | |
| } | |
| #spinglassComm<-spinglass.community(gMin5LCCUndirected, weights=NA) | |
| #write.csv(spinglassComm$membership,"spinglassCommMembership.csv") | |
| modularity(gMin5LCCUndirected,as.factor(V(gMin5LCCUndirected)$majLangAdj)) | |
| #[1] 0.6489853 ge2? | |
| #[1] 0.6291782 ge4_ge2-20 | |
| #[1] 0.6671592 ge4_ge2-20_inms | |
| ################################################### | |
| ## Q3: Lang outwardness ## | |
| ################################################### | |
| #TODO: Include all languages???? | |
| sort(table(V(gMin5LCCUndirected)$majLangAdj)) | |
| #langList<-sort(c("in","es","ms","pt","ja","en","ru","de","tr","it","fil","fr","ar","th","ko","nl")) | |
| #gLangs<-induced.subgraph(gMin5LCC,V(gMin5LCC)$majLangAdj %in% langList) | |
| langList<-sort(unique(V(gMin5LCCUndirected)$majLangAdj))#Must be in same order as langSizes!!! | |
| gLangs<-gMin5LCC | |
| sort(table(V(gLangs)$majLangAdj)) | |
| #langs <- unique(V(gLangs)$majLangAdj) | |
| langSizes<-table(V(gLangs)$majLangAdj) | |
| totalNodes<-length(V(gLangs)) #Check if this needs $something? | |
| totalEdges<-length(E(gLangs)) #Check if this needs $weight? | |
| #Check distribution of majLangPercentAdj for different languages | |
| library(plyr) | |
| dfMultilingual<-data.frame(lang=V(gMin5LCCUndirected)$majLangAdj,percent=V(gMin5LCCUndirected)$majLangPercentAdj) | |
| dfMultilingual$lang<-as.factor(dfMultilingual$lang) | |
| dfMultiCollapse<-ddply(dfMultilingual,.(lang),function(df) {c(length(df$percent),min(df$percent),max(df$percent),mean(df$percent),median(df$percent),length(df$percent[df$percent!=1]))}) | |
| names(dfMultiCollapse)<-c("lang","nodeCount","min","max","mean","median","multilingualCount") | |
| dfMultiCollapse$multiPercent<-dfMultiCollapse$multilingualCount/dfMultiCollapse$nodeCount | |
| largeLangs<-as.character(dfMultiCollapse$lang[dfMultiCollapse$nodeCount>1000]) | |
| plot(log(dfMultiCollapse$nodeCount),dfMultiCollapse$multiPercent) | |
| dfMultiCollapse[order(dfMultiCollapse$nodeCount),c("lang","nodeCount","multiPercent")] | |
| dfMultiCollapseSub<-subset(dfMultiCollapse,nodeCount>1000) | |
| with(dfMultiCollapseSub,plot(log(nodeCount),multiPercent)) | |
| svg("multilingualism_ge1000.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(subset(dfMultiCollapse,nodeCount>1000),aes(x=nodeCount,y=multiPercent)) | |
| plot<-plot+geom_point() | |
| plot<- plot + scale_x_log10("Language size (log of number of users)", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| svg("multilingualism.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent)) | |
| plot<-plot+geom_point() | |
| plot<- plot + scale_x_log10("Language size (log of number of users)", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| svg("multilingualism-labels.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent,label=lang)) | |
| plot<-plot+geom_text()#Use size= here to control size | |
| plot<- plot + scale_x_log10("Language size (number of users)", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot | |
| dev.off() | |
| svg("multilingualism-labels_and_point.svg",width=svgWidth,height=svgHeight) | |
| plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent,label=lang)) + geom_point() | |
| plot<-plot+geom_text(aes(y=multiPercent+0.02))#Use size= here to control size | |
| plot<- plot + scale_x_log10("Language size (log of number of users)", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent) | |
| plot<-plot + theme_bw() + | |
| theme(legend.title=element_text(size=18),legend.text=element_text(size=16), | |
| axis.title.x=element_text(size=18),axis.text.x=element_text(size=16), | |
| axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none") | |
| plot#+stat_smooth(method="loess") | |
| #method="lm","gam"/library(mgcv) (http://www.inside-r.org/r-doc/mgcv/gam) | |
| dev.off() | |
| #Correlation between size/mutlilingualism? | |
| cor(dfMultiCollapse$nodeCount,dfMultiCollapse$multiPercent) | |
| with(dfMultiCollapse,cor(log(nodeCount),multiPercent)) | |
| with(subset(dfMultiCollapse,nodeCount<=1000),cor(log(nodeCount),multiPercent)) | |
| with(subset(dfMultiCollapse,nodeCount<=10^5),cor(log(nodeCount),multiPercent)) | |
| coef(lm(log(nodeCount) ~ multiPercent, data = dfMultiCollapse)) | |
| cor.test(dfMultiCollapse$nodeCount,dfMultiCollapse$multiPercent,alternative="less",conf.level=0.95) | |
| cor.test(log(dfMultiCollapse$nodeCount),dfMultiCollapse$multiPercent,alternative="less",conf.level=0.95) | |
| # method = c("pearson", "kendall", "spearman") | |
| #dfMultiCollapseSub[order(dfMultiCollapseSub$multiPercent),c("lang","nodeCount","multiPercent")] | |
| rm(dfMultiCollapseSub) | |
| dfMultiCollapse[order(dfMultiCollapse$multiPercent),c("lang","nodeCount","multiPercent")] | |
| dfMultiCollapse[order(dfMultiCollapse$nodeCount),c("lang","nodeCount","multiPercent")] | |
| N<-length(langList) | |
| dfOut <- data.frame(lang=rep("", N), nodeCount=rep(0, N), edgeCount=rep(0, N), | |
| inLangEdges=rep(0, N),expInLangEdges=rep(0, N), | |
| stringsAsFactors=FALSE) | |
| i<-0 | |
| for (lang in langList) { | |
| #langEdges<-length( | |
| # E(gLangs)[E(gLangs)$source %in% | |
| # V(gLangs)[V(gLangs)$majLangAdj==lang] | |
| # ]) | |
| langEdges<-sum(degree(gLangs,V(gLangs)$majLangAdj==lang,mode="out")) | |
| inGroupEdges<- | |
| length(E(induced.subgraph(gLangs,V(gLangs)$majLangAdj==lang))) | |
| #bwGroupEdges<-langEdges-inGroupEdges | |
| expInGroupEdges<-(as.numeric(langSizes[lang])/totalNodes)*langEdges | |
| i<-i+1 | |
| dfOut[i, ] <- list(lang,as.numeric(langSizes[lang]),langEdges,inGroupEdges,expInGroupEdges) | |
| } | |
| #dfOut$nodeCount<-as.numeric(dfOut$nodeCount) | |
| #dfOut$edgeCount<-as.numeric(dfOut$edgeCount) | |
| #dfOut$inLangEdges<-as.numeric(dfOut$inLangEdges) | |
| #dfOut$expInLangEdges<-as.numeric(dfOut$expInLangEdges) | |
| dfOut$diff<-dfOut$inLangEdges-dfOut$expInLangEdges | |
| dfOut$fracIn<-dfOut$inLangEdges/dfOut$edgeCount | |
| dfOut$expFrac<-dfOut$expInLangEdges/dfOut$edgeCount | |
| dfOut$nodeFrac<-dfOut$nodeCount/totalNodes | |
| dfOut$pError<-dfOut$diff/dfOut$expInLangEdges | |
| dfOut$diffPercent<-abs(dfOut$diff)/((dfOut$inLangEdges+dfOut$expInLangEdges)/2) | |
| dfOut$logFracIn<-log(dfOut$inLangEdges)/log(dfOut$edgeCount) | |
| dfOut$zscore<-(dfOut$inLangEdges-mean(dfOut$inLangEdges))/sd(dfOut$inLangEdges) | |
| write.csv(dfOut,"dfOut_directed.csv") | |
| with(subset(dfOut,fracIn>0.8),plot(log(nodeCount),fracIn)) | |
| source("../tweet-langs-inms.R") | |
| dfTweetLangs<-tweetLangAll() | |
| dfTweetLangs<-subset(dfTweetLangs,lang %in% langList) | |
| totalTweets<-sum(dfTweetLangs$count) | |
| dfTweetLangs$percent<-dfTweetLangs$count/totalTweets | |
| dfOut$tweetCount<-NA | |
| dfOut$tweetPercent<-NA | |
| for (lang in langList) { | |
| dfOut$tweetCount[dfOut$lang==lang]<-dfTweetLangs$count[dfTweetLangs$lang==lang] | |
| dfOut$tweetPercent[dfOut$lang==lang]<-dfTweetLangs$percent[dfTweetLangs$lang==lang] | |
| } | |
| dfOut$tweetExpEdges<-dfOut$tweetPercent*dfOut$edgeCount | |
| dfOut$tweetdiff<-dfOut$inLangEdges-dfOut$tweetExpEdges | |
| #Table \label{tbl:inwardness} | |
| dfSum<-subset(dfOut,nodeCount>1000)[ , c("lang","nodeFrac","fracIn") ] | |
| dfSum$nodeFrac<-round(dfSum$nodeFrac*100,2) | |
| dfSum$fracIn<-round(dfSum$fracIn*100,2) | |
| dfSum[order(dfSum$fracIn, decreasing=TRUE), ] | |
| ggplot(dfSum,aes(label=lang,x=nodeFrac,y=fracIn))+geom_text() | |
| #ident <- function(x) {(x/totalNodes)*totalEdges} | |
| # | |
| plot <- ggplot(dfOut,aes(x=nodeCount,y=fracIn)) + geom_point() | |
| #plot <- plot + stat_function(fun = ident, linetype="dashed") | |
| plot <- plot + scale_x_log10("Language size (log of number of users)", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot | |
| #ident <- function(x) {x} | |
| plot <- ggplot(dfOut,aes(x=nodeCount,y=diff)) + geom_point() | |
| #plot <- plot + stat_function(fun = ident, linetype="dashed") | |
| plot <- plot + stat_smooth(method="lm", se=TRUE) | |
| plot <- plot + scale_x_log10("Language size (log of number of users)", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot <- plot + scale_y_log10("Difference from expected (log)", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| plot | |
| dfA<-data.frame(lang=rep("", N),var=rep("", N),val=rep(0, N)) | |
| dfA$lang<-dfOut$lang | |
| dfA$var<-"fracIn" | |
| dfA$val<-dfOut$fracIn | |
| dfB<-data.frame(lang=rep("", N),var=rep("", N),val=rep(0, N)) | |
| dfB$lang<-dfOut$lang | |
| dfB$var<-"fracExp" | |
| dfB$val<-dfOut$expFrac | |
| dfTrans<-rbind(dfA,dfB) | |
| rm(dfA) | |
| rm(dfB) | |
| dfTrans$nodeCount<-dfOut$nodeCount | |
| plot <- ggplot(dfTrans,aes(x=nodeCount,y=val,group=var,color=var)) + geom_point() | |
| plot <- plot + scale_x_log10("Language size (log of number of users)", | |
| breaks = trans_breaks("log10", function(x) 10^x), | |
| labels = trans_format("log10", math_format(10^.x))) | |
| #plot <- plot + scale_y_log10("Difference from expected (log)", | |
| # breaks = trans_breaks("log10", function(x) 10^x), | |
| # labels = trans_format("log10", math_format(10^.x))) | |
| plot | |
| #E(gLangs)$sourceLang<-"" | |
| #E(gLangs)$targetLang<-"" | |
| #edges<-get.edges(gLangs, E(gLangs)) | |
| #edgeLangs<-data.frame(source=rep("",length(edges)), target=rep("",length(edges)),stringsAsFactors=FALSE) | |
| #i<-0 | |
| #for (edge in edges) { | |
| # s<-edges[edge,1] | |
| # t<-edges[edge,2] | |
| # sLang<-V(gLangs)[s]$majLangAdj | |
| # tLang<-V(gLangs)[t]$majLangAdj | |
| # i<-i+1 | |
| # edgeLangs[i, ] <- c(sLang,tLang) | |
| #} | |
| #gCollapsed<-contract.vertices(gLangs, which(langList==V(gLangs)$majLangAdj)) | |
| levels<-as.factor(V(gLangs)$majLangAdj) | |
| edges<-get.edges(gLangs, E(gLangs)) | |
| E(gLangs)$source<-edges[,1] | |
| E(gLangs)$target<-edges[,2] | |
| gCollapsed<-contract.vertices(gLangs, levels,vertex.attr.comb="first") | |
| try({ | |
| gCollapsed<-remove.edge.attribute(gCollapsed, "sourceLang") | |
| gCollapsed<-remove.edge.attribute(gCollapsed, "targetLang") | |
| }) | |
| gCollapsed<-simplify(gCollapsed,remove.loops=TRUE,remove.multiple=FALSE) | |
| E(gCollapsed)$weight<-1 | |
| gCollapsed<-simplify(gCollapsed,remove.multiple=TRUE,edge.attr.comb=list(weight="sum",source=function(x) sum(!duplicated(x)),target=function(x) sum(!duplicated(x)))) | |
| summary(gCollapsed) | |
| V(gCollapsed)$label<-V(gCollapsed)$majLangAdj | |
| V(gCollapsed)$name<-V(gCollapsed)$majLangAdj | |
| V(gCollapsed)$langSize<-0 | |
| for (index in V(gCollapsed)) { | |
| l<-V(gCollapsed)[index]$majLangAdj | |
| V(gCollapsed)[index]$langSize<-langSizes[which(langList==l)] | |
| } | |
| V(gCollapsed)$langSizeLog<-log(V(gCollapsed)$langSize) | |
| #New insertion 2013-12-19################### | |
| #(graph without normalization / expected a la Wikipedia article (ultimately not used in published paper, but retained for reference) | |
| E(gCollapsed)$mentionCount<-E(gCollapsed)$weight | |
| E(gCollapsed)$weightLog<-log(E(gCollapsed)$source) | |
| weightLogSD<-sd(E(gCollapsed)$weightLog) | |
| weightLogMean<-mean(E(gCollapsed)$weightLog) | |
| print(paste0("Mean: ",weightLogMean)) | |
| print(paste0("95% Mark: ",weightLogMean+(1.96*weightLogSD))) | |
| #Add some columns for use in visualization | |
| #Random bug, igraph doesn't write boolean values to graph output files; so, use numbers | |
| E(gCollapsed)$weightLog_geMean<-ifelse(E(gCollapsed)$weightLog>=weightLogMean,1,0) | |
| E(gCollapsed)$weightLog_ge1SD<-ifelse(E(gCollapsed)$weightLog>=weightLogMean+weightLogSD,1,0) | |
| E(gCollapsed)$weightLog_ge95<-ifelse(E(gCollapsed)$weightLog>=weightLogMean+(1.96*weightLogSD),1,0) | |
| E(gCollapsed)$weightLog_diffMean<-E(gCollapsed)$weightLog-weightLogMean | |
| #Repeat with percent of users in source lang mention users in target lang? | |
| edges<-get.edges(gCollapsed, E(gCollapsed)) | |
| E(gCollapsed)$weightPercent<-0 | |
| for (edge in E(gCollapsed)) { | |
| s=V(gCollapsed)[edges[[edge,1]]]$name | |
| E(gCollapsed)[edge]$weightPercent<-E(gCollapsed)[edge]$source/langSizes[which(langList==s)] | |
| } | |
| E(gCollapsed)$weight<-E(gCollapsed)$weightLog | |
| write.graph(gCollapsed,"gCollasped_nonnormalized_usercounts.graphml",format="graphml") | |
| #List top edges by weightlog (but give actual weight) | |
| tmp<-get.edgelist(gCollapsed) | |
| tmp<-cbind(as.data.frame(tmp),E(gCollapsed)$weightLog,E(gCollapsed)$source,E(gCollapsed)$weightPercent) | |
| names(tmp)<-c("source","target","weightLog","weight","weightPercent") | |
| #head(tmp[order(tmp$weight,decreasing=TRUE), ],n=25) | |
| head(tmp[order(tmp$weight,decreasing=TRUE), c("source","target","weight","weightPercent")],n=7) | |
| #Filter to ge95 and drop isolates | |
| gTmp<-gCollapsed | |
| gTmp<-delete.edges(gTmp,E(gTmp)[E(gTmp)$weightLog_ge95!=1]) | |
| V(gTmp)$degree<-degree(gTmp) | |
| gTmp<-delete.vertices(gTmp,V(gTmp)$degree==0) | |
| #Repeat everything above with English removed? | |
| ################End insertion 2013-12-19################################## | |
| #Add missing edges with weight of 0 | |
| edgelist<-as.data.frame(get.edgelist(gCollapsed,names=TRUE)) | |
| names(edgelist)=c("source","target") | |
| edgelist$exists<-TRUE | |
| N<-length(langList)*(length(langList)-1) | |
| dfEdgesFull<-data.frame(source=rep("",N),target=rep("",N),stringsAsFactors=FALSE) | |
| i<-0 | |
| for (source in langList) { | |
| for (target in langList) { | |
| if (target!=source) { | |
| i<-i+1 | |
| dfEdgesFull[i, ]<-c(source,target) | |
| } | |
| } | |
| } | |
| dfEdgesFull<-merge(x=dfEdgesFull,y=edgelist,all=TRUE) | |
| missing<-dfEdgesFull[is.na(dfEdgesFull$exists), ] | |
| missing<-data.frame(source=missing$source,target=missing$target) | |
| medges<-c() | |
| for (index in seq(1,length(missing$source))) { | |
| medges<-c(medges,as.character(missing[index, "source"]), | |
| as.character(missing[index, "target"]) | |
| ) | |
| } | |
| gCollapsed<-add.edges(gCollapsed,medges,attr=list(weight=0)) | |
| summary(gCollapsed) | |
| #V(gCollapsed)$outDegree<-degree(gCollapsed,V(gCollapsed),mode="out") #Not out-degree, sum of weight on out edges | |
| V(gCollapsed)$outWeight<-0 | |
| E(gCollapsed)$expected<-NA | |
| edges<-get.edges(gCollapsed, E(gCollapsed)) | |
| for (edge in seq(1,length(edges)/2)) { | |
| s<-edges[edge,1] | |
| V(gCollapsed)[s]$outWeight<-V(gCollapsed)[s]$outWeight + E(gCollapsed)[edge]$weight | |
| } | |
| for (edge in seq(1,length(edges)/2)) { | |
| s<-edges[edge,1] | |
| t<-edges[edge,2] | |
| sLang<-V(gCollapsed)[s]$majLangAdj | |
| tLang<-V(gCollapsed)[t]$majLangAdj | |
| #TODO: Demoninator needs to be reduced by the size of the source language!!! | |
| #sLangSize<-langSizes[which(langList==sLang)] | |
| #E(gCollapsed)[edge]$expected<-(langSizes[which(langList==tLang)]/(totalNodes-sLangSize)) * V(gCollapsed)[s]$outWeight | |
| E(gCollapsed)[edge]$expected<-(langSizes[which(langList==tLang)]/totalNodes) * V(gCollapsed)[s]$outWeight | |
| } | |
| summary(E(gCollapsed)$weight) | |
| summary(E(gCollapsed)$expected) | |
| E(gCollapsed)$pError<-(E(gCollapsed)$weight-E(gCollapsed)$expected)/E(gCollapsed)$expected | |
| E(gCollapsed)$pErrorZ<-(E(gCollapsed)$pError-mean(E(gCollapsed)$pError,na.rm=TRUE))/sd(E(gCollapsed)$pError,na.rm=TRUE) | |
| E(gCollapsed)$diffPercent<-abs(E(gCollapsed)$weight-E(gCollapsed)$expected)/((E(gCollapsed)$weight+E(gCollapsed)$expected)/2) | |
| E(gCollapsed)$zscore<-(E(gCollapsed)$weight-mean(E(gCollapsed)$weight))/sd(E(gCollapsed)$weight) | |
| summary(E(gCollapsed)$pError) | |
| summary(E(gCollapsed)$zscore) | |
| write.graph(gCollapsed,"gCollapsed.graphml",format="graphml") | |
| gCollapsedLargeLangs<-induced.subgraph(gCollapsed,V(gCollapsed)$majLangAdj %in% largeLangs) | |
| E(gCollapsedLargeLangs)$pErrorZ<-(E(gCollapsedLargeLangs)$pError-mean(E(gCollapsedLargeLangs)$pError,na.rm=TRUE))/sd(E(gCollapsedLargeLangs)$pError,na.rm=TRUE) | |
| E(gCollapsedLargeLangs)$zscore<-(E(gCollapsedLargeLangs)$weight-mean(E(gCollapsedLargeLangs)$weight))/sd(E(gCollapsedLargeLangs)$weight) | |
| write.graph(gCollapsedLargeLangs,"gCollapsedLargeLangs.graphml",format="graphml") | |
| edgeDataset<-function(graph) { | |
| N<-length(E(graph)) | |
| dfEdges<-data.frame(source=rep("",N),target=rep("",N),pError=rep(NA,N),zscore=rep(NA,N),pErrorZ=rep(NA,N),stringsAsFactors=FALSE) | |
| edges<-get.edges(graph, E(graph)) | |
| for (edge in seq(1,N)) { | |
| s<-edges[edge,1] | |
| t<-edges[edge,2] | |
| sLang<-V(graph)[s]$majLangAdj | |
| tLang<-V(graph)[t]$majLangAdj | |
| pError<-E(graph)[edge]$pError | |
| zscore<-E(graph)[edge]$zscore | |
| pErrorZ<-E(graph)[edge]$pErrorZ | |
| dfEdges[edge, ]<-list(sLang,tLang,pError,zscore,pErrorZ) | |
| } | |
| return(dfEdges) | |
| } | |
| #Table \label{tbl:langlang-more} | |
| #head(dfEdges[order(dfEdges$pError,decreasing=TRUE),c("source","target","pError")],n=10) | |
| dfEdgesLarge<-edgeDataset(gCollapsedLargeLangs) | |
| dfEdgesLarge$pError100<-round(dfEdgesLarge$pError*100) | |
| head(dfEdgesLarge[order(dfEdgesLarge$pError,decreasing=TRUE),c("source","target","pError100")],n=10) | |
| #subset(dfEdges,pErrorZ>=1.96|pErrorZ<=1.96) | |
| dfEdges<-edgeDataset(gCollapsed)#Is this really the same? Shouldn't it be different | |
| dfEdgesSub<-subset(dfEdges,source%in%largeLangs & target%in%largeLangs) | |
| head(dfEdgesSub[order(dfEdgesSub$pError,decreasing=TRUE),c("source","target","pError")],n=10) | |
| rm(dfEdgesSub) | |
| dfEdgesSub<-subset(dfEdges,zscore > 1.96| zscore < -1.96) | |
| dfEdgesSub[order(dfEdgesSub$zscore,decreasing=TRUE),c("source","target","zscore")] | |
| rm(dfEdgesSub) | |
| dfEdges$pErrorZ<-(dfEdges$pError-mean(dfEdges$pError,na.rm=TRUE))/sd(dfEdges$pError,na.rm=TRUE) | |
| dfEdgesSub<-subset(dfEdges,pErrorZ > 1.96 | pErrorZ < -1.96) | |
| head(dfEdgesSub[order(dfEdgesSub$pErrorZ,decreasing=TRUE),c("source","target","pError","pErrorZ")],n=10) | |
| rm(dfEdgesSub) | |
| dfEdgesSub<-subset(dfEdges,source%in%largeLangs & target%in%largeLangs) | |
| dfEdgesSub$pErrorZ<-(dfEdgesSub$pError-mean(dfEdgesSub$pError,na.rm=TRUE))/sd(dfEdgesSub$pError,na.rm=TRUE) | |
| dfEdgesSub<-subset(dfEdgesSub,pErrorZ > 1.96 | pErrorZ < -1.96) | |
| head(dfEdgesSub[order(dfEdgesSub$pErrorZ,decreasing=TRUE),c("source","target","pError","pErrorZ")],n=10) | |
| rm(dfEdgesSub) | |
| #N<-length(largeLangs)*(length(largeLangs)-1) | |
| #dfEdgesFull<-data.frame(source=rep("",N),target=rep("",N),stringsAsFactors=FALSE) | |
| #i<-0 | |
| #for (source in largeLangs) { | |
| # for (target in largeLangs) { | |
| # if (target!=source) { | |
| # i<-i+1 | |
| # dfEdgesFull[i, ]<-c(source,target) | |
| # } | |
| # } | |
| #} | |
| # | |
| # | |
| #dfEdgesFull<-merge(x=dfEdgesFull,y=dfEdgesLarge,all=TRUE) | |
| #dfEdgesFull[is.na(dfEdgesFull$pError), ] | |
| # | |
| ##Any pairs with no edges in either direction? (\label{tbl:langlang-less}) | |
| #tmp<-dfEdgesFull[is.na(dfEdgesFull$pError), c("source","target")] | |
| #dis<-data.frame(source=tmp$source,target=tmp$target) | |
| #dis2<-data.frame(source=tmp$target,target=tmp$source) | |
| #reallyDis<-merge(x=dis,y=dis2,all=FALSE) | |
| #reallyDis | |
| #New version of R lists includes edges with weight 0, this results in pError of -1 | |
| sum(E(gCollapsedLargeLangs)$weight==0)==sum(dfEdgesLarge$pError==-1) | |
| sum(dfEdgesLarge$pError==-1) | |
| #24 disconnected pairs | |
| dfTmp<-dfEdgesLarge[dfEdgesLarge$pError==-1,c("source","target")] | |
| dfTmp$mutual<-0 | |
| for (i in seq(1,length(dfTmp$mutual))) { | |
| s<-dfTmp[i,"source"] | |
| t<-dfTmp[i,"target"] | |
| if (sum(dfTmp[dfTmp$source==t, ]$target==s)==1) { | |
| dfTmp[i,"mutual"]<-1 | |
| } | |
| } | |
| missingTable<-function(df) { | |
| str<-paste0(df$source," & ") | |
| for (i in length(df$target)) { | |
| if (df$mutual[i]) { | |
| str<-paste0(str,"\\emphas{",df$target[i],"},") | |
| } else { | |
| str<-paste0(str,df$target[i],",") | |
| } | |
| } | |
| print(str) | |
| return(str) | |
| } | |
| ddply(dfTmp,.(source),missingTable) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment