fuente: https://github.com/DiegoKoz/MIA_text_mining
Warning: Este dataset es muy pesado. Eso implica que lleva tiempo correr los modelos y que puede no entrar en la memoria de la computadora. Para la clase, se puede hacer un muestreo de textos para que no pese tanto
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(tidyverse)
[30m── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.2.0 [32m✔[30m [34mpurrr [30m 0.3.2
[32m✔[30m [34mtibble [30m 2.1.3 [32m✔[30m [34mdplyr [30m 0.8.3
[32m✔[30m [34mtidyr [30m 0.8.3 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.4.0[39m
[30m── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mpurrr[30m::[32mis_null()[30m masks [34mtestthat[30m::is_null()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mdplyr[30m::[32mmatches()[30m masks [34mtestthat[30m::matches()[39m
library(glue)
Attaching package: ‘glue’
The following object is masked from ‘package:dplyr’:
collapse
library(tm)
Loading required package: NLP
Attaching package: ‘NLP’
The following object is masked from ‘package:ggplot2’:
annotate
library(topicmodels)
library(tidytext)
library(stringi)
library(LDAvis)
library(slam)
library(tsne)
library(lubridate)
Attaching package: ‘lubridate’
The following object is masked from ‘package:base’:
date
library(DT)
library(lsa)
Loading required package: SnowballC
library(igraph)
Attaching package: ‘igraph’
The following objects are masked from ‘package:lubridate’:
%--%, union
The following objects are masked from ‘package:dplyr’:
as_data_frame, groups, union
The following objects are masked from ‘package:purrr’:
compose, simplify
The following object is masked from ‘package:tidyr’:
crossing
The following object is masked from ‘package:tibble’:
as_data_frame
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:testthat’:
compare
The following object is masked from ‘package:base’:
union
library(ggraph)
library(tidygraph)
Attaching package: ‘tidygraph’
The following object is masked from ‘package:igraph’:
groups
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:testthat’:
matches
library(cluster)
library(plotly)
Attaching package: ‘plotly’
The following object is masked from ‘package:igraph’:
groups
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
df <- read_csv('../data/txt/texto_limpio.txt')
# df <- read_rds('data/MIA.RDS')
df <- df %>%
filter(tipo=='notas')
df <- df %>%
mutate(texto = tolower(texto),
texto = stri_trans_general(texto, "Latin-ASCII"),
texto = str_trim(texto,side = 'both'),
texto = str_replace_all(texto,'\t',' '),
texto = str_replace_all(texto,'\n',' '),
texto = str_replace_all(texto,'\r',' '),
texto = str_replace_all(texto,'[[:punct:]]',' '),
texto = str_remove_all(texto,'\\d'),
# texto = str_replace_all(texto,'\\d','NUM'),
# texto = str_replace_all(texto,'(NUM)+','NUM'),
texto = str_replace_all(texto,"\\s+", " "))
Para topic modeling las palabras comunes de la lengua generan mucho ruido y terminan predominnado en los topicos.
Vamos a eliminar no solo las Stop Words, sino también las palabras más utilizadas en el español que no están relacionadas con nuestra temática. Para eso, tenemos un archivo r_words.txt donde pusimos todas las palabras más comunes.
Además, aprovechamos para eliminar los tokens que quedaron del scrapeo que en realidad son parte del código html (ver final del archivo).
¿de donde salieron estos tokens? En una primera iteración del LDA, uno de los tópicos que se armó era de código html.
palabras_comunes <- read_csv(file = 'data/r_words.txt',col_names = F)
palabras_comunes <-stri_trans_general(palabras_comunes$X1, "Latin-ASCII") # le tengo que hacer la misma transformacion que al texto
stop_words <- stri_trans_general(stopwords(kind = "es"), "Latin-ASCII")
palabras_eliminar <- unique(c(stop_words,palabras_comunes))
rm(stop_words)
rm(palabras_comunes)
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2247014 120.1 3932032 210.0 3932032 210.0
Vcells 12114753 92.5 40208022 306.8 33439841 255.2
Corpus = VCorpus(VectorSource(df$texto))
Corpus = tm_map(Corpus, removeWords, palabras_eliminar)
# Corpus <- tm_map(Corpus, stemDocument, language = "spanish") # Corpus
# dtm <- DocumentTermMatrix(Corpus, control = list(weighting = function(x) weightTfIdf(x, normalize =TRUE)))
dtm <- DocumentTermMatrix(Corpus)
rm(Corpus)
gc()
# tm::nTerms(dtm)
#elimino los docuemntos vacios
# rowTotals <- rowSums(as.matrix(dtm))
# nDocs(dtm)
# dtm <- dtm[rowTotals> 0, ]
# nDocs(dtm)
write_rds(dtm, 'data/dtm_MIA.rds')
# df <- df[which(rowTotals>0),] #%>% #tengo que eliminar ese docuemnto que estaba vacio
dtm <- read_rds('data/dtm_MIA.rds')
limpio la memoria porque ya no me queda espacio
rm(palabras_eliminar)
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2375606 126.9 3932032 210.0 3932032 210.0
Vcells 17027965 130.0 40208022 306.8 33439841 255.2
# lda_fit <- LDA(dtm, k = 20,method = "Gibbs", control = list(delta=0.6,seed = 1234))
lda_fit <- LDA(dtm, k = 20,method = "Gibbs", control = list(delta=0.1,seed = 1234))
lda_fit
saveRDS(lda_fit, 'modelos/MIA_lda20.rds') # Tarda mucho en correr, asi que guardamos los resultados
lda_fit <- read_rds('modelos/MIA_lda20.rds')
Terms <- terms(lda_fit, 10)
Terms
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 Topic 10
[1,] "hombre" "trotsky" "campesinos" "produccion" "gobierno" "casa" "oposicion" "stalin" "marx" "historia"
[2,] "hombres" "escritos" "obreros" "trabajo" "politica" "noche" "escritos" "trotsky" "engels" "sino"
[3,] "ser" "carta" "tierra" "economia" "lucha" "anos" "internacional" "partido" "rusia" "sociedad"
[4,] "mujeres" "lenin" "campesino" "industria" "imperialismo" "gente" "izquierda" "lenin" "alemania" "ser"
[5,] "vida" "escrito" "trabajo" "capitalista" "pais" "tiempo" "comintern" "oposicion" "general" "social"
[6,] "mujer" "moscu" "tierras" "capital" "masas" "hombres" "politica" "anos" "movimiento" "pensamiento"
[7,] "todas" "gpu" "rusia" "capitalismo" "nacional" "podia" "burocracia" "moscu" "edicion" "naturaleza"
[8,] "moral" "mexico" "terratenientes" "desarrollo" "movimiento" "ciudad" "cion" "sovietica" "internacional" "vida"
[9,] "sociedad" "libro" "pueblo" "mercado" "frente" "cinco" "comunista" "comite" "primera" "realidad"
[10,] "pueblo" "articulo" "ley" "economica" "popular" "hombre" "trotsky" "central" "aleman" "mundo"
Topic 11 Topic 12 Topic 13 Topic 14 Topic 15 Topic 16 Topic 17 Topic 18 Topic 19
[1,] "partido" "iglesia" "chile" "revolucion" "social" "gobierno" "china" "guerra" "trabajadores"
[2,] "organizacion" "siglo" "gobierno" "proletariado" "trabajadores" "espana" "revolucion" "alemania" "clase"
[3,] "internacional" "historia" "pueblo" "clase" "nacional" "revolucion" "pueblo" "internacional" "movimiento"
[4,] "comite" "religion" "pais" "lucha" "sistema" "republica" "partido" "europa" "capitalismo"
[5,] "politica" "tierra" "trabajadores" "masas" "economica" "espanola" "lucha" "paises" "anos"
[6,] "camaradas" "america" "popular" "burguesia" "trabajo" "movimiento" "socialista" "sovietica" "trabajadora"
[7,] "comunista" "mundo" "presidente" "poder" "politica" "frente" "pais" "mundial" "ser"
[8,] "trabajo" "anos" "companeros" "partido" "derecho" "espanol" "socialismo" "francia" "gente"
[9,] "congreso" "nota" "millones" "obreros" "administracion" "poum" "mao" "paz" "incluso"
[10,] "camarada" "oro" "unidad" "obrera" "produccion" "popular" "nacional" "hitler" "mundo"
Topic 20
[1,] "ejercito"
[2,] "frente"
[3,] "rojo"
[4,] "militares"
[5,] "guerra"
[6,] "militar"
[7,] "poder"
[8,] "sovietico"
[9,] "pueblo"
[10,] "ser"
diccionario <- tibble(
n_topico = 1:20,
# nombre_topico = c('nolabel1','Lenin','mitin','Hombre&Sociedad','Chile','Gramsci','interna_bolchevique','GPU','ANAPO','Judiciales','Partido','Bolivia','Islam','nolabel2','Economia','España','China','Lula','nolabel3','Ejercito rojo')
nombre_topico = c('Humanidad','Trotksy','Campesinos','Economia','Imperialismo','tiempo&lugar','oposicion','Stalin','Marx','Historia', 'Partido','Iglesia', 'Chile','Revolucion', 'produccion','España', 'China','2da_Guerra', 'Clase_obrera','Ejercito_rojo')
)
Visualizacion
topicmodels_json_ldavis <- function(fitted, dtm){
svd_tsne <- function(x) tsne(svd(x)$u)
# Find required quantities
phi <- as.matrix(posterior(fitted)$terms)
theta <- as.matrix(posterior(fitted)$topics)
vocab <- colnames(phi)
term_freq <- slam::col_sums(dtm)
# Convert to json
json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
vocab = vocab,
mds.method = svd_tsne,
plot.opts = list(xlab="tsne", ylab=""),
doc.length = as.vector(table(dtm$i)),
term.frequency = term_freq)
return(json_lda)
}
json_res <- topicmodels_json_ldavis(lda_fit, dtm)
serVis(json_res,as.gist = T,open.browser = T)
dist_topicos <- df %>%
select(autor, fecha) %>%
bind_cols(as_tibble(as.matrix(posterior(lda_fit)$topics)))
names(dist_topicos)[3:22] <- diccionario$nombre_topico
dist_topicos_autor <- dist_topicos %>%
group_by(autor) %>%
summarise_all(~mean(.x, na.rm = T)) %>%
mutate(fecha=round(fecha))
dist_topicos_autor %>%
filter(!is.na(fecha)) %>%
datatable(., filter = 'top',extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('excel', "csv", "copy", "pdf"), pageLength = 20, autoWidth = TRUE),rownames= FALSE) %>%
formatPercentage(diccionario$nombre_topico, 2) %>%
formatStyle(diccionario$nombre_topico, background = styleColorBar(c(0,1), 'deepskyblue')) %>%
formatStyle(diccionario$nombre_topico, background = styleColorBar(c(0,1), 'deepskyblue')) %>%
formatStyle(diccionario$nombre_topico,
# formatPercentage(3:22, 2) %>%
# formatStyle(3:22, background = styleColorBar(c(0,1), 'deepskyblue')) %>%
# formatStyle(3:22,
backgroundSize = '98% 60%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center')
NA
topicos_tsne <- tsne(dist_topicos_autor[3:22],k = 2)
topicos_tsne <- as_tibble(topicos_tsne,.name_repair = ~glue('tsne_proj_{c(1,2)}'))
grafico <- dist_topicos_autor %>% bind_cols(topicos_tsne) %>%
ggplot(aes(tsne_proj_1,tsne_proj_2, label=autor, color=fecha)) +
geom_text()+
theme_minimal()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1850)
plotly::ggplotly(grafico)
dist_topicos_autor %>% bind_cols(topicos_tsne) %>%
filter(!is.na(fecha), fecha>1800) %>%
ggplot(aes(tsne_proj_1,tsne_proj_2, label=autor, color=fecha)) +
geom_text()+
theme_minimal()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1940)
topicos_pca <- princomp(dist_topicos_autor[3:22], cor = TRUE)
dist_topicos_autor %>% bind_cols(as_tibble(topicos_pca$scores[,1:2])) %>%
ggplot(aes(Comp.1,Comp.2, label=autor, color = fecha)) +
geom_text()+
theme_minimal()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1850)
NA
NA
dist_topicos_autor %>% bind_cols(as_tibble(topicos_pca$scores[,1:2])) %>%
filter(!is.na(fecha),fecha>1800) %>%
ggplot(aes(Comp.1,Comp.2, label=autor, color = fecha)) +
geom_text()+
theme_minimal()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1940)
k=10
pam_clusters <- pam(dist_topicos_autor[3:22],k = k,diss = F,stand = T, )
summary(pam_clusters)
Medoids:
ID Humanidad Trotksy Campesinos Economia Imperialismo tiempo&lugar oposicion Stalin Marx Historia Partido
[1,] 190 0.09206349 0.002884813 0.004086987 0.01892440 0.005045016 0.011762685 0.012377926 0.002539253 0.004796462 0.76591316 0.006819386
[2,] 191 0.43839733 0.031890974 0.022057183 0.02711177 0.023270285 0.023135496 0.021922394 0.022663734 0.047919252 0.06384179 0.026100855
[3,] 153 0.03883322 0.101413942 0.010880010 0.01390255 0.018427795 0.227741885 0.016609272 0.102905835 0.035790731 0.01730487 0.063559044
[4,] 184 0.02659928 0.009278781 0.015124924 0.03295832 0.257908545 0.030150986 0.006260534 0.009855075 0.012183236 0.02528189 0.076016490
[5,] 75 0.04763598 0.040763124 0.040763124 0.03732670 0.052257376 0.052257376 0.044199550 0.062566655 0.068254533 0.04651025 0.040763124
[6,] 14 0.03619621 0.003037863 0.022365822 0.13190665 0.004593827 0.004340070 0.003394276 0.004768416 0.024432062 0.09536530 0.030343363
[7,] 176 0.05999597 0.022622080 0.018319017 0.03931176 0.015371069 0.035861380 0.010122712 0.016379477 0.135015624 0.04315868 0.115023625
[8,] 112 0.02359757 0.049336949 0.027525923 0.02042388 0.020784529 0.029617672 0.020423883 0.052377143 0.021000917 0.05024664 0.127680086
[9,] 200 0.03457500 0.019025973 0.028659160 0.02623429 0.016196290 0.009158635 0.017732643 0.019726407 0.021939945 0.05145205 0.097395054
[10,] 108 0.03146336 0.011310510 0.039974844 0.03487473 0.023827376 0.019087013 0.008934409 0.039216298 0.044600218 0.01065911 0.073462224
Iglesia Chile Revolucion produccion España China 2da_Guerra Clase_obrera Ejercito_rojo
[1,] 0.005216397 0.004390462 0.01079546 0.01946653 0.003238765 0.002747551 0.005287764 0.01976730 0.001876187
[2,] 0.024483386 0.028527058 0.02293331 0.05292388 0.022461550 0.021922394 0.029740159 0.02239416 0.026303038
[3,] 0.016386924 0.008493464 0.12946779 0.01322820 0.018184331 0.076758531 0.020200075 0.04350941 0.026402110
[4,] 0.011238688 0.089958911 0.15182380 0.06896957 0.014612939 0.052187843 0.033022710 0.05139565 0.025171824
[5,] 0.054508828 0.043073824 0.04307382 0.03732670 0.054568077 0.056819528 0.057945254 0.06831378 0.051072402
[6,] 0.005048823 0.006402463 0.35186079 0.09681440 0.006849645 0.006689822 0.021539684 0.13569123 0.008359277
[7,] 0.019035916 0.021786551 0.20939818 0.03971192 0.024011686 0.017993890 0.068750129 0.04444687 0.043683463
[8,] 0.026588243 0.028535733 0.12785772 0.03463335 0.219994768 0.031060257 0.024968026 0.04270644 0.020640271
[9,] 0.013688183 0.019261473 0.03802019 0.03147106 0.016893661 0.370170587 0.020462241 0.12014872 0.027788437
[10,] 0.010272703 0.022872172 0.25853970 0.03520687 0.026436144 0.052622441 0.207909143 0.01620995 0.032520789
Clustering vector:
[1] 1 2 3 4 2 5 3 6 7 5 7 5 8 6 4 7 9 4 5 2 5 3 6 6 6 4 4 4 5 7 9 9 5 9 9 6 5 2 6 6 2 5 9 8 6 6 3
[48] 2 8 9 10 6 1 8 5 2 7 2 5 4 6 8 2 7 7 7 7 6 1 10 4 4 2 8 5 4 4 6 6 7 6 8 6 10 9 9 4 9 9 4 5 5 6 6
[95] 9 3 2 3 3 8 9 10 8 3 4 4 4 10 4 8 6 8 3 8 7 7 10 7 9 6 6 8 10 7 3 5 9 10 3 5 9 9 9 5 8 5 4 2 2 9 7
[142] 6 4 7 6 1 5 2 5 2 3 4 3 8 8 8 6 1 4 7 6 3 2 7 9 2 6 2 5 8 3 2 9 9 10 7 7 3 5 10 4 5 4 4 9 9 8 8
[189] 10 1 2 2 6 3 5 6 5 4 5 9 5 9 5 2 10 9 10 6 9 9 9
Objective function:
build swap
5.393865 5.315724
Numerical information per cluster:
size max_diss av_diss diameter separation
[1,] 6 5.908814 2.590574 7.297411 5.021292
[2,] 22 7.095931 4.428404 11.142928 2.164822
[3,] 17 13.283751 7.549457 21.669889 4.689612
[4,] 25 20.371400 5.695338 28.185803 4.266684
[5,] 28 17.778057 5.838546 24.292393 2.164822
[6,] 30 11.837765 5.939195 16.675037 3.155592
[7,] 20 9.104322 4.913987 13.808223 2.955347
[8,] 20 10.652164 5.020015 15.404674 4.130236
[9,] 30 7.218954 4.384370 10.371582 3.155592
[10,] 13 11.765190 5.081457 14.386499 3.669479
Isolated clusters:
L-clusters: character(0)
L*-clusters: character(0)
Silhouette plot information:
cluster neighbor sil_width
190 1 2 0.749577066
146 1 2 0.749291017
158 1 7 0.740853172
53 1 7 0.716520889
69 1 2 0.620368332
1 1 2 0.333200634
48 2 9 0.415316631
191 2 7 0.405032593
63 2 9 0.397267702
148 2 9 0.374234585
166 2 9 0.366677819
97 2 9 0.359085682
20 2 7 0.333337822
163 2 9 0.331961505
5 2 9 0.311557707
58 2 6 0.279566478
73 2 5 0.229832724
150 2 7 0.219301310
192 2 9 0.218974743
168 2 5 0.213906430
38 2 9 0.209399382
41 2 7 0.182859559
204 2 5 0.182322820
56 2 9 0.179497319
139 2 5 0.171837563
172 2 9 0.104032587
138 2 9 0.092578130
2 2 9 0.051590972
194 3 2 0.159503337
151 3 5 0.121612410
104 3 2 0.120839046
7 3 2 0.117860848
96 3 2 0.100701037
178 3 2 0.092367567
125 3 7 0.080222329
47 3 2 0.073244028
99 3 2 0.058877549
98 3 5 0.058420192
153 3 7 0.045660822
171 3 5 0.032128908
3 3 9 0.003151756
113 3 4 -0.021278258
129 3 8 -0.041837808
22 3 2 -0.052099655
162 3 9 -0.069924710
137 4 9 0.289183159
76 4 9 0.268134976
109 4 9 0.260162631
105 4 9 0.257682681
18 4 9 0.225181660
106 4 9 0.224478959
184 4 9 0.223440034
77 4 9 0.216324307
26 4 2 0.204839672
27 4 9 0.195311510
71 4 9 0.191133720
143 4 9 0.187093689
4 4 9 0.166373642
107 4 9 0.149596306
72 4 9 0.147955095
181 4 9 0.084593088
198 4 2 0.052254578
87 4 2 0.032952110
28 4 2 0.023097691
60 4 9 0.020244026
152 4 9 0.019320806
15 4 5 0.017692385
90 4 9 0.007231498
183 4 9 -0.040009978
159 4 9 -0.163743956
136 5 7 0.152870572
21 5 2 0.152832793
6 5 2 0.146620935
75 5 7 0.133063047
29 5 2 0.132457724
179 5 2 0.127757533
92 5 2 0.119710657
147 5 2 0.118908926
195 5 2 0.113164895
203 5 7 0.097813714
33 5 9 0.096676622
126 5 9 0.057676240
201 5 2 0.022767709
55 5 2 0.009228322
197 5 7 -0.006201649
149 5 1 -0.021835351
12 5 8 -0.022359231
10 5 7 -0.023682676
59 5 9 -0.025732033
134 5 9 -0.053598074
182 5 7 -0.053796722
130 5 9 -0.053865116
169 5 4 -0.063066529
199 5 7 -0.091101595
37 5 7 -0.124560792
91 5 9 -0.130033925
42 5 2 -0.136216144
19 5 2 -0.151071077
79 6 9 0.213677306
14 6 9 0.177296463
145 6 9 0.176275752
167 6 9 0.155594991
81 6 7 0.133548389
161 6 9 0.125680272
45 6 9 0.124750321
36 6 9 0.103873342
93 6 9 0.103036123
142 6 2 0.095261870
196 6 10 0.086468296
83 6 7 0.065909884
46 6 9 0.040714760
193 6 9 0.035691116
120 6 9 0.013721047
121 6 9 0.013697251
94 6 9 0.013525804
61 6 9 0.012805217
157 6 9 0.003763680
8 6 9 -0.008983622
68 6 7 -0.050745880
208 6 9 -0.057041075
111 6 9 -0.061248579
52 6 7 -0.111673730
39 6 2 -0.123147475
25 6 7 -0.129219809
40 6 7 -0.135052521
78 6 9 -0.160410556
24 6 1 -0.210651394
23 6 9 -0.222339184
176 7 9 0.265467350
116 7 9 0.250021209
144 7 2 0.227185401
66 7 2 0.221803057
124 7 2 0.211641358
65 7 9 0.177325299
164 7 9 0.164442906
115 7 9 0.158151067
177 7 2 0.121297369
67 7 9 0.102837743
64 7 5 0.098476006
57 7 9 0.086447153
80 7 9 0.072190933
30 7 2 0.071657579
160 7 10 0.038262536
16 7 9 0.033835533
9 7 9 0.019352689
141 7 2 0.005250102
118 7 10 -0.034150051
11 7 2 -0.035797783
156 8 9 0.392337835
114 8 9 0.389624622
103 8 9 0.373083905
155 8 9 0.352480698
74 8 9 0.341282079
112 8 9 0.313335803
62 8 9 0.285557264
82 8 9 0.265594385
44 8 2 0.220317875
135 8 9 0.214820107
170 8 7 0.194611548
49 8 9 0.176373777
188 8 9 0.137208195
110 8 9 0.130800133
100 8 9 0.128089640
13 8 7 0.090674061
187 8 9 0.080111567
54 8 9 0.053144331
154 8 9 0.016655581
122 8 9 -0.030398958
132 9 7 0.463733601
86 9 2 0.444720038
202 9 2 0.426796810
131 9 7 0.421939921
32 9 7 0.413211152
173 9 2 0.397148676
200 9 7 0.395372356
209 9 6 0.394747092
186 9 7 0.383757292
31 9 7 0.373436378
50 9 10 0.369857766
140 9 7 0.339410271
101 9 7 0.329533617
174 9 7 0.322529017
43 9 2 0.311565183
210 9 2 0.300068120
211 9 7 0.296122084
127 9 7 0.291167536
88 9 7 0.288304121
35 9 6 0.282413998
85 9 2 0.271743291
206 9 6 0.259605192
89 9 7 0.243541976
34 9 7 0.237968596
119 9 6 0.220426883
185 9 10 0.212704986
133 9 7 0.153565036
165 9 7 0.075014445
95 9 7 0.071524625
17 9 2 0.022334773
180 10 9 0.355317929
128 10 7 0.345488497
102 10 7 0.335111095
123 10 7 0.264209550
108 10 7 0.254358106
70 10 7 0.210412568
117 10 7 0.196580720
207 10 9 0.184807587
175 10 9 0.133433034
189 10 9 0.086183036
205 10 9 0.082997120
51 10 9 -0.022593146
84 10 7 -0.123964523
Average silhouette width per cluster:
[1] 0.65163519 0.25591691 0.05173232 0.13042097 0.01872960 0.01415927 0.11278487 0.20628522 0.30047549 0.17710320
Average silhouette width of total data set:
[1] 0.1532093
Available components:
[1] "medoids" "id.med" "clustering" "objective" "isolation" "clusinfo" "silinfo" "diss" "call" "data"
plot(silhouette(pam_clusters),col=1:k)
ggplotly(dist_topicos_autor[pam_clusters$id.med,] %>%
gather(topico, valor, 3:22) %>%
ggplot(aes(topico, valor,fill=topico)) +
geom_col() +
facet_wrap(.~autor)+
theme(legend.position = 'none'))
plot <- dist_topicos %>%
select(-autor) %>%
mutate(fecha= round(fecha, -1)) %>%
group_by(fecha) %>%
summarise_all(~mean(.x, na.rm = T)) %>%
filter(!is.na(fecha), fecha>1800) %>%
gather(topico, valor,2:21) %>%
mutate(topico=factor(topico)) %>%
ggplot(aes(fecha, valor, color=topico))+
geom_line()+
scale_x_continuous(breaks = scales::pretty_breaks(10))+
# directlabels::geom_dl(aes(label = topico), method=list("top.qp", cex = .75))+
theme_minimal()
theme(legend.position = 'none')
List of 1
$ legend.position: chr "none"
- attr(*, "class")= chr [1:2] "theme" "gg"
- attr(*, "complete")= logi FALSE
- attr(*, "validate")= logi TRUE
plotly::ggplotly(plot)
obs: El dataset tiene mucho de Allende, Tópico 6. Tal vez habria que subsamplear.
Terms
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 Topic 10
[1,] "hombre" "trotsky" "campesinos" "produccion" "gobierno" "casa" "oposicion" "stalin" "marx" "historia"
[2,] "hombres" "escritos" "obreros" "trabajo" "politica" "noche" "escritos" "trotsky" "engels" "sino"
[3,] "ser" "carta" "tierra" "economia" "lucha" "anos" "internacional" "partido" "rusia" "sociedad"
[4,] "mujeres" "lenin" "campesino" "industria" "imperialismo" "gente" "izquierda" "lenin" "alemania" "ser"
[5,] "vida" "escrito" "trabajo" "capitalista" "pais" "tiempo" "comintern" "oposicion" "general" "social"
[6,] "mujer" "moscu" "tierras" "capital" "masas" "hombres" "politica" "anos" "movimiento" "pensamiento"
[7,] "todas" "gpu" "rusia" "capitalismo" "nacional" "podia" "burocracia" "moscu" "edicion" "naturaleza"
[8,] "moral" "mexico" "terratenientes" "desarrollo" "movimiento" "ciudad" "cion" "sovietica" "internacional" "vida"
[9,] "sociedad" "libro" "pueblo" "mercado" "frente" "cinco" "comunista" "comite" "primera" "realidad"
[10,] "pueblo" "articulo" "ley" "economica" "popular" "hombre" "trotsky" "central" "aleman" "mundo"
Topic 11 Topic 12 Topic 13 Topic 14 Topic 15 Topic 16 Topic 17 Topic 18 Topic 19
[1,] "partido" "iglesia" "chile" "revolucion" "social" "gobierno" "china" "guerra" "trabajadores"
[2,] "organizacion" "siglo" "gobierno" "proletariado" "trabajadores" "espana" "revolucion" "alemania" "clase"
[3,] "internacional" "historia" "pueblo" "clase" "nacional" "revolucion" "pueblo" "internacional" "movimiento"
[4,] "comite" "religion" "pais" "lucha" "sistema" "republica" "partido" "europa" "capitalismo"
[5,] "politica" "tierra" "trabajadores" "masas" "economica" "espanola" "lucha" "paises" "anos"
[6,] "camaradas" "america" "popular" "burguesia" "trabajo" "movimiento" "socialista" "sovietica" "trabajadora"
[7,] "comunista" "mundo" "presidente" "poder" "politica" "frente" "pais" "mundial" "ser"
[8,] "trabajo" "anos" "companeros" "partido" "derecho" "espanol" "socialismo" "francia" "gente"
[9,] "congreso" "nota" "millones" "obreros" "administracion" "poum" "mao" "paz" "incluso"
[10,] "camarada" "oro" "unidad" "obrera" "produccion" "popular" "nacional" "hitler" "mundo"
Topic 20
[1,] "ejercito"
[2,] "frente"
[3,] "rojo"
[4,] "militares"
[5,] "guerra"
[6,] "militar"
[7,] "poder"
[8,] "sovietico"
[9,] "pueblo"
[10,] "ser"
dist_topicos_autor <- dist_topicos %>%
group_by(autor) %>%
summarise_all(~mean(.x, na.rm = T)) %>%
mutate(fecha=round(fecha))
adjMat = cosine(t(as.matrix(dist_topicos_autor[,3:21])))
colnames(adjMat) <- dist_topicos_autor$autor
rownames(adjMat) <- dist_topicos_autor$autor
adjMat[1:5,1:5]
Albert Einstein Albert Mathiez Albert Rhys Williams Alberto Flores Galindo Alejandra Kollontai
Albert Einstein 1.0000000 0.2806837 0.1178042 0.3402257 0.4083771
Albert Mathiez 0.2806837 1.0000000 0.3282932 0.3041627 0.8507887
Albert Rhys Williams 0.1178042 0.3282932 1.0000000 0.2903798 0.3500499
Alberto Flores Galindo 0.3402257 0.3041627 0.2903798 1.0000000 0.3070904
Alejandra Kollontai 0.4083771 0.8507887 0.3500499 0.3070904 1.0000000
fivenum(adjMat)
[1] 0.009840354 0.231637403 0.367635382 0.541784794 1.000000000
#la paso a dicotomica, no quiero que me quede muy densa, asi que pongo como punto de corte un valor alto
adjMat[adjMat>0.75] <- 1
adjMat[adjMat<0.75] <- 0
adjMat[1:5,1:5]
Albert Einstein Albert Mathiez Albert Rhys Williams Alberto Flores Galindo Alejandra Kollontai
Albert Einstein 1 0 0 0 0
Albert Mathiez 0 1 0 0 1
Albert Rhys Williams 0 0 1 0 0
Alberto Flores Galindo 0 0 0 1 0
Alejandra Kollontai 0 1 0 0 1
g = graph_from_adjacency_matrix(adjMat, weighted= NULL, mode="undirected", diag=FALSE)
V(g)$fecha <- dist_topicos_autor$fecha #agrego la fecha como atributo de cada autor
mean(degree(g))
[1] 15.41232
l <- layout_nicely(g)
plot(g,edge.arrow.size=.2, vertex.size=4,vertex.frame.color="#ffffff",
vertex.label="", vertex.label.color="black",
layout=l)
as_tbl_graph(g) %>%
filter(fecha>1800) %>%
activate(nodes) %>%
mutate(importance = centrality_degree()) %>%
filter(importance >1) %>%
# as_tbl_graph(g) %>%
# filter(fecha>1800,
# !degree(g)<2) %>%
ggraph() +
geom_edge_link(color='grey') +
geom_node_point(aes(color=fecha))+
geom_node_text(aes(label=name),check_overlap = T,nudge_y =-.5 ) +
theme_void()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1900)
as_tbl_graph(g) %>%
filter(fecha>1800) %>%
activate(nodes) %>%
# as_tbl_graph(g) %>%
# filter(fecha>1800,
# !degree(g)<2) %>%
ggraph() +
geom_edge_link(color='grey') +
geom_node_point(aes(color=fecha))+
geom_node_text(aes(label=name),check_overlap = T,nudge_y =-.5 ) +
theme_void()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1900)
El mismo gráfico, de forma interactiva (no esta implementada la visualización de los links)
grafo_plot <- as_tbl_graph(g) %>%
filter(fecha>1800) %>%
activate(nodes) %>%
# as_tbl_graph(g) %>%
# filter(fecha>1800,
# !degree(g)<2) %>%
ggraph() +
geom_edge_link(color='grey') +
geom_node_point(aes(color=fecha,label=name))+
# geom_node_text(aes(label=name),check_overlap = T,nudge_y =-.5 ) +
theme_void()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1900)
plotly::ggplotly(grafo_plot)
comunities_louvain <- as_tbl_graph(g) %>%
mutate(community = as.factor(group_louvain()))
comunities_louvain %>%
ggraph(layout = 'kk') +
geom_edge_link(aes(alpha = ..index..), show.legend = FALSE) +
geom_node_point(aes(colour = community,label=name), size = 2) +
theme_graph()
ggplotly(comunities_louvain %>%
ggraph(layout = 'kk') +
geom_edge_link(aes(alpha = ..index..), show.legend = FALSE) +
geom_node_point(aes(colour = community,label=name), size = 2) +
theme_graph() )
ggplotly(comunities_louvain %>%
vertex_attr() %>%
bind_rows() %>%
select(autor = name, community) %>%
left_join(dist_topicos_autor) %>%
group_by(community) %>%
summarise_if(is.numeric, mean, na.rm = TRUE) %>%
gather(topico, valor, 3:22) %>%
mutate(fecha = round(fecha)) %>%
ggplot(aes(topico, valor,fill=topico, label=topico))+
geom_col(position = position_dodge())+
facet_wrap(community~fecha, scales = 'free', labeller = label_both) +
theme_minimal()+
theme(legend.position = 'none',
axis.text.x = element_blank()))
NA
NA
NA