Topic modelling med gensim og dhlab

2.2. Topic modelling med gensim og dhlab

import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# sprit urn URN:NBN:no-nb_digibok_2020090207537

Finn en bok

# Look up digitised books ("digibok") in dhlab by title, keeping at most one hit
# (limit=1). The hit shown in the output carries the search word inside a longer
# title, so the title argument is evidently not an exact-match filter.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
# Bare expression: lets the notebook render the corpus as a table.
bok
dhlabid urn title authors oaiid sesamid isbn10 city timestamp year publisher langs subjects ddc genres literaryform doctype ocr_creator ocr_timestamp
0 100492480 URN:NBN:no-nb_digibok_2016021108166 En kortfattet norgeshistorie i versform Søraas , O. oai:nb.bibsys.no:990202045744702202 93a057269a403a3c94012884ac324ac2 Trondhjem 19080101 1908 nob Uklassifisert digibok nb 20060101
# Take the URN from the first row of the corpus frame (the only row here, since
# the corpus was built with limit=1). Raises IndexError if the frame is empty.
urn = bok.frame.urn.tolist()[0]
# Bare expression: echo the URN in the notebook.
urn
'URN:NBN:no-nb_digibok_2016021108166'

2.2.1. Chunking

# Split the book identified by `urn` into chunks via dhlab. Judging by how
# `res.chunks` is consumed further down, each chunk maps tokens to their counts.
# NOTE(review): chunks=1000 is presumably the chunk size in words — confirm
# against the dhlab documentation.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks returned for this book.
len(res.chunks)
5
def chunks_to_corpus(chunks_list):
    """Turn token-count mappings into space-separated pseudo-texts.

    Each element of ``chunks_list`` is iterated for its tokens and indexed
    by token to obtain that token's integer count. For every element, each
    token is written out ``count`` times, every occurrence followed by a
    single space (so a non-empty text ends with a trailing space).

    Returns a list holding one string per element of ``chunks_list``.
    """
    return [
        "".join((token + " ") * counts[token] for token in counts)
        for counts in chunks_list
    ]
# One space-separated pseudo-text per chunk.
# NOTE(review): `texts` is not referenced again in the cells shown here.
texts = chunks_to_corpus(res.chunks)

2.2.2. Find delta TFIDF

# Token-by-chunk count matrix: one row per token, one column per chunk.
# A token that does not occur in a chunk gets the count 0.
df = pd.DataFrame(res.chunks).T.fillna(0)
# Bare expression: render the matrix in the notebook.
df
0 1 2 3 4
! 5.0 0.0 4.0 2.0 0.0
" 3.0 0.0 3.0 1.0 3.0
' 9.0 0.0 4.0 3.0 2.0
, 64.0 1.0 67.0 62.0 69.0
- 6.0 0.0 5.0 7.0 8.0
... ... ... ... ... ...
verden 0.0 0.0 0.0 0.0 1.0
verre 0.0 0.0 0.0 0.0 1.0
vilde 0.0 0.0 0.0 0.0 1.0
vokste 0.0 0.0 0.0 0.0 1.0
voldsom 0.0 0.0 0.0 0.0 1.0

1625 rows × 5 columns

# Reference counts: the top 50000 tokens from the NB collection.
tot = dh.totals(50000)
# Ratio of each token's count in the book (summed over all chunks) to its
# reference count. The division aligns on the token index, so tokens absent
# from either side come out as NaN (assumes `tot` is indexed by token).
# Kept under its own name so the Chunks object in `res` is not overwritten and
# the chunking cells above can still be re-run.
rel_freq = df.sum(axis=1) / tot.freq
# Keep up to 1000 tokens with the highest ratio, i.e. the tokens most
# over-represented in the book relative to the reference collection.
# sort_values places NaN last; dropna removes any NaN that still falls inside
# the first 1000 entries.
target_tokens = rel_freq.sort_values(ascending=False).iloc[:1000].dropna().index
# Bare expression: show the per-chunk counts of the selected tokens.
df.loc[target_tokens]
0 1 2 3 4
regjerte 1.0 0.0 1.0 1.0 0.0
adel 0.0 0.0 2.0 0.0 1.0
Kalv 0.0 0.0 0.0 1.0 2.0
storslagen 1.0 0.0 0.0 0.0 1.0
velde 0.0 0.0 1.0 0.0 1.0
... ... ... ... ... ...
3 0.0 0.0 0.0 1.0 1.0
vil 1.0 0.0 0.0 1.0 0.0
Vi 1.0 0.0 0.0 0.0 0.0
dette 0.0 0.0 1.0 0.0 0.0
har 0.0 0.0 0.0 2.0 3.0

1000 rows × 5 columns

# Build one pseudo-document per chunk from the selected tokens: every token
# with a positive count in that chunk is written out `count` times, each
# occurrence followed by a single space.
outer_lst = [
    "".join(
        (token + " ") * int(count)
        for token, count in counts.items()
        if count > 0
    )
    for _, counts in df.loc[target_tokens].items()
]
# Number of pseudo-documents (one per chunk column).
len(outer_lst)
5

2.2.3. Prep for LDA

# Split every pseudo-document on whitespace into a list of tokens.
data = list(map(str.split, outer_lst))
# Vocabulary built from the tokenised documents.
id2word = gensim.corpora.Dictionary(data)
# Convert each tokenised document with the dictionary's doc2bow.
corpus = list(map(id2word.doc2bow, data))

2.2.4. Make model

# Fit a multicore LDA topic model on the converted corpus.
# NOTE(review): num_topics=10 exceeds the 5 chunk-documents built above —
# confirm this is intended.
# NOTE(review): no random_state is passed, so the topics may differ between
# runs — confirm whether reproducibility matters here.
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=10)
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
# Pretty-print each topic id together with its weighted top words.
pprint(lda_model.print_topics())
[(0,
  '0.055*"," + 0.038*"og" + 0.023*"." + 0.017*"i" + 0.016*"han" + 0.012*"til" '
  '+ 0.012*"med" + 0.010*"var" + 0.010*"som" + 0.009*";"'),
 (1,
  '0.046*"," + 0.043*"og" + 0.025*"." + 0.019*"han" + 0.016*"i" + 0.011*"som" '
  '+ 0.011*"en" + 0.010*"med" + 0.010*"til" + 0.010*"av"'),
 (2,
  '0.023*"," + 0.020*"og" + 0.012*"." + 0.012*"i" + 0.008*";" + 0.007*"han" + '
  '0.007*"til" + 0.006*"var" + 0.006*"en" + 0.006*"som"'),
 (3,
  '0.070*"," + 0.042*"og" + 0.037*"." + 0.025*"i" + 0.020*"han" + 0.015*"med" '
  '+ 0.015*";" + 0.014*"en" + 0.013*"til" + 0.012*"av"'),
 (4,
  '0.068*"," + 0.051*"og" + 0.040*"." + 0.023*"i" + 0.019*"han" + 0.016*";" + '
  '0.016*"en" + 0.012*"paa" + 0.012*"blev" + 0.012*"med"'),
 (5,
  '0.034*"," + 0.026*"og" + 0.020*"." + 0.014*"i" + 0.013*"han" + 0.009*";" + '
  '0.009*"var" + 0.009*"den" + 0.008*"med" + 0.007*"som"'),
 (6,
  '0.027*"," + 0.018*"og" + 0.013*"." + 0.012*"i" + 0.009*"han" + 0.007*"med" '
  '+ 0.006*";" + 0.006*"til" + 0.006*"var" + 0.005*"som"'),
 (7,
  '0.066*"," + 0.028*"og" + 0.024*"i" + 0.021*"." + 0.019*"han" + 0.014*";" + '
  '0.014*"en" + 0.011*"med" + 0.010*"som" + 0.010*"til"'),
 (8,
  '0.076*"," + 0.050*"og" + 0.035*"." + 0.029*"i" + 0.023*"han" + 0.016*";" + '
  '0.015*"den" + 0.013*"som" + 0.013*"til" + 0.013*"var"'),
 (9,
  '0.029*"," + 0.022*"og" + 0.021*"." + 0.013*"han" + 0.011*"i" + 0.010*"en" + '
  '0.009*";" + 0.009*"som" + 0.008*"med" + 0.007*"av"')]
# Prepare the pyLDAvis data from the fitted model, the corpus and the dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)
# Optional alternative: save the visualisation to a standalone HTML file.
# pyLDAvis.save_html(prep, "result.html")
# Switch on notebook rendering, then show the prepared visualisation inline.
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)