Topic modelling med gensim og dhlab

2.2. Topic modelling med gensim og dhlab

import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# sprit urn URN:NBN:no-nb_digibok_2020090207537

Finn en bok

# Build a dhlab corpus limited to a single digitised book ("digibok"),
# searched by title with the term "Norgeshistorie".
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
# Show the corpus metadata for the book that was found.
bok
dhlabid urn title authors oaiid sesamid isbn10 city timestamp year publisher langs subjects ddc genres literaryform doctype ocr_creator ocr_timestamp
0 100313140 URN:NBN:no-nb_digibok_2016101748078 Nordmenn før oss : norgeshistorie for gymnaset Bull , Edvard oai:nb.bibsys.no:999822590504702202 25dce22c432fc70670f9d4a3fdf34355 Oslo 19590101 1959 Johan Grundt Tanum nob norge / historie 948.1 Faglitteratur digibok nb 20060101
# Take the URN from the first row of the corpus metadata frame
# (the corpus was created with limit=1).
urn = bok.frame.urn.tolist()[0]
urn
'URN:NBN:no-nb_digibok_2016101748078'

2.2.1. Chunking

# Fetch the book identified by `urn` split into chunks.
# NOTE(review): chunks=1000 is presumably the chunk size in tokens - confirm
# against the dhlab documentation.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks the book was split into.
len(res.chunks)
146
def chunks_to_corpus(chunks_list):
    """Expand per-chunk token counts into one text string per chunk.

    Each element of ``chunks_list`` is a mapping from token to an integer
    count. Every token is written out ``count`` times, each occurrence
    followed by a single space, following the mapping's iteration order.

    Returns a list with one string per chunk, in the same order as the input.
    """
    return [
        "".join((token + " ") * chunk[token] for token in chunk)
        for chunk in chunks_list
    ]
# Expand every chunk's token counts into a plain text string.
# NOTE(review): `texts` is not referenced again in the visible part of this
# notebook - confirm whether it is still needed.
texts = chunks_to_corpus(res.chunks)

2.2.2. Find delta TFIDF

# Token-by-chunk count matrix: after the transpose there is one row per token
# and one column per chunk; a token missing from a chunk gets the count 0.
df = pd.DataFrame(res.chunks).transpose().fillna(0)
df
0 1 2 3 4 5 6 7 8 9 ... 136 137 138 139 140 141 142 143 144 145
! 1.0 0.0 1.0 0.0 2.0 0.0 0.0 0.0 0.0 2.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0
' 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 3.0 0.0 0.0
( 3.0 1.0 1.0 2.0 2.0 2.0 2.0 1.0 1.0 0.0 ... 2.0 0.0 1.0 1.0 2.0 0.0 3.0 4.0 0.0 2.0
) 11.0 6.0 6.0 7.0 4.0 2.0 7.0 11.0 10.0 5.0 ... 13.0 4.0 6.0 4.0 2.0 6.0 15.0 15.0 6.0 18.0
* 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 2.0 1.0 0.0 0.0 0.0 3.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
uskåret 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
varefeller 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
vevnad 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
vevstolen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
Ål 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0

18248 rows × 146 columns

# Import the top 50000 tokens from the NB collection as reference counts
tot = dh.totals(50000)
# Relative frequency: each token's total count in this book divided by its
# reference count. It gets its own name so that `res` (the dh.Chunks object)
# is not overwritten, which would break re-running cells that read res.chunks.
# NOTE(review): assumes tot.freq is indexed by token so the division aligns on
# token; tokens without a reference count then come out as NaN.
relative_freq = df.sum(axis=1) / tot.freq
# Keep at most the 1000 tokens with the highest ratio (most over-represented
# in this book); NaN ratios sort last and are dropped.
target_tokens = (
    relative_freq.sort_values(ascending=False).iloc[:1000].dropna().index
)
df.loc[target_tokens]
0 1 2 3 4 5 6 7 8 9 ... 136 137 138 139 140 141 142 143 144 145
kongene 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
embetsmenn 3.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 ... 1.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
Bøndene 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 2.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0
Grunnloven 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
reformasjonen 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
fins 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0
blokade 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
paragrafer 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
illegale 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
titelen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1000 rows × 146 columns

# Select the rows for the target tokens once; the selection does not change
# between columns, so it does not need to be recomputed inside the loop.
target_counts = df.loc[target_tokens]

# Build one text string per chunk (column): every target token with a positive
# count is repeated `count` times, each occurrence followed by a space.
outer_lst = []
for col in target_counts.columns:
    outer_lst.append(
        "".join(
            (token + " ") * int(count)
            for token, count in target_counts[col].items()
            if count > 0
        )
    )
len(outer_lst)
146

2.2.3. Prep for LDA

# Split every chunk string on whitespace to get one token list per chunk.
data = list(map(str.split, outer_lst))
# Dictionary built from the token lists, used below to encode each chunk.
id2word = gensim.corpora.Dictionary(data)
# Encode each chunk's token list with the dictionary's doc2bow.
corpus = list(map(id2word.doc2bow, data))

2.2.4. Make model

# Train an LDA model with 10 topics on the bag-of-words corpus.
# LDA training is randomised; fixing random_state makes the topics
# reproducible from one run of this cell to the next.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=42,
)
# Print the topics found by the model.
pprint(lda_model.print_topics())
[(0,
  '0.024*"Sverige" + 0.023*"Danmark" + 0.022*"bøndene" + 0.021*"konge" + '
  '0.020*"kongen" + 0.019*"Kristian" + 0.018*"Stortinget" + 0.015*"bønder" + '
  '0.014*"sjøl" + 0.014*"svenske"'),
 (1,
  '0.021*"bøte" + 0.018*"bøndene" + 0.015*"skip" + 0.011*"Danmark" + '
  '0.011*"Kap" + 0.011*"kongen" + 0.009*"Sverige" + 0.009*"Stortinget" + '
  '0.009*"Sverdrup" + 0.009*"svenske"'),
 (2,
  '0.056*"Kristian" + 0.024*"Fredrik" + 0.021*"kongen" + 0.021*"Sverige" + '
  '0.020*"konge" + 0.016*"sjøl" + 0.014*"regjeringen" + 0.013*"Danmark" + '
  '0.011*"krigen" + 0.010*"danske"'),
 (3,
  '0.021*"arbeiderne" + 0.014*"krigen" + 0.013*"Eder" + 0.012*"skip" + '
  '0.012*"kongen" + 0.010*"Danmark" + 0.010*"byene" + 0.009*"sjøl" + '
  '0.009*"Sverige" + 0.008*"borgere"'),
 (4,
  '0.030*"regjeringen" + 0.022*"Sverige" + 0.017*"kongen" + 0.014*"England" + '
  '0.012*"svenske" + 0.012*"krigen" + 0.012*"Sverdrup" + 0.009*"Kristian" + '
  '0.009*"Stortinget" + 0.009*"riksdagen"'),
 (5,
  '0.043*"Danmark" + 0.026*"Sverige" + 0.022*"kongen" + 0.020*"krigen" + '
  '0.016*"bøndene" + 0.014*"krig" + 0.014*"danske" + 0.014*"konge" + '
  '0.014*"England" + 0.010*"makt"'),
 (6,
  '0.027*"Sverige" + 0.013*"arbeiderne" + 0.013*"svenske" + 0.012*"Danmark" + '
  '0.010*"bøndene" + 0.010*"danske" + 0.010*"England" + 0.009*"revolusjon" + '
  '0.008*"industrien" + 0.008*"sjøl"'),
 (7,
  '0.027*"bøndene" + 0.019*"sjøl" + 0.019*"regjeringen" + 0.012*"bønder" + '
  '0.011*"rike" + 0.011*"København" + 0.011*"tyskerne" + 0.010*"skip" + '
  '0.010*"Rig" + 0.010*"Danmark"'),
 (8,
  '0.023*"bøndene" + 0.014*"industrien" + 0.012*"bønder" + 0.011*"kongen" + '
  '0.010*"1850" + 0.009*"uti" + 0.009*"bøte" + 0.009*"drapsmannen" + '
  '0.009*"skip" + 0.008*"sjøl"'),
 (9,
  '0.029*"kongen" + 0.026*"bøndene" + 0.025*"Sverige" + 0.024*"Stortinget" + '
  '0.020*"svenske" + 0.014*"Danmark" + 0.014*"makt" + 0.013*"sjøl" + '
  '0.013*"regjeringen" + 0.011*"krigen"')]
# Prepare the pyLDAvis visualisation data from the trained model, the
# bag-of-words corpus and the dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)
# Optional: uncomment to save the visualisation as a standalone HTML file
# pyLDAvis.save_html(prep, "result.html")
# Enable notebook rendering and display the visualisation inline.
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)