{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Topic modelling med gensim og dhlab" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import dhlab as dh\n", "import pandas as pd\n", "import gensim\n", "import pyLDAvis\n", "from pprint import pprint\n", "import pyLDAvis.gensim_models as genvis" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# sprit urn URN:NBN:no-nb_digibok_2020090207537" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finn en bok" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "bok = dh.Corpus(doctype=\"digibok\", limit=1, title=\"Norgeshistorie\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dhlabidurntitleauthorsoaiidsesamidisbn10citytimestampyearpublisherlangssubjectsddcgenresliteraryformdoctypeocr_creatorocr_timestamp
0100441759URN:NBN:no-nb_digibok_2009111200018Elevens ressursbok : verdens- og Norgeshistori...Berger , Finn / Henningsen , Rune / Aass , Oleoai:nb.bibsys.no:9997215599647022029d569a267470cda9851b718c9bdd20f08200423387199701011997Universitetsforl.nobEldre tid / Tidlig nytid / Historie / Verden /...Faglitteraturdigiboknb20060101
\n", "
" ], "text/plain": [ " dhlabid urn \\\n", "0 100441759 URN:NBN:no-nb_digibok_2009111200018 \n", "\n", " title \\\n", "0 Elevens ressursbok : verdens- og Norgeshistori... \n", "\n", " authors \\\n", "0 Berger , Finn / Henningsen , Rune / Aass , Ole \n", "\n", " oaiid sesamid \\\n", "0 oai:nb.bibsys.no:999721559964702202 9d569a267470cda9851b718c9bdd20f0 \n", "\n", " isbn10 city timestamp year publisher langs \\\n", "0 8200423387 19970101 1997 Universitetsforl. nob \n", "\n", " subjects ddc genres \\\n", "0 Eldre tid / Tidlig nytid / Historie / Verden /... \n", "\n", " literaryform doctype ocr_creator ocr_timestamp \n", "0 Faglitteratur digibok nb 20060101 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bok" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'URN:NBN:no-nb_digibok_2009111200018'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urn = bok.frame.urn.tolist()[0]\n", "urn" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Chunking" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Chunks \n", "res = dh.Chunks(chunks=1000, urn=urn)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "31" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(res.chunks)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def chunks_to_corpus(chunks_list):\n", "    \"\"\"Expand per-chunk token counts into one space-separated string per chunk.\n", "\n", "    Each element of ``chunks_list`` maps token -> count; every token is\n", "    written out ``count`` times, each occurrence followed by a space.\n", "    Returns a list with one string per chunk, in input order.\n", "    \"\"\"\n", "    # One join per chunk instead of repeated ``+=``; also avoids a local\n", "    # named ``res`` shadowing the notebook-level ``res`` defined above.\n", "    return [\n", "        \"\".join((token + \" \") * chunk[token] for token in chunk)\n", "        for chunk in chunks_list\n", "    ]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "texts = chunks_to_corpus(res.chunks)" ] }, { "attachments": {}, "cell_type": "markdown", 
"metadata": {}, "source": [ "## Find delta TFIDF" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(res.chunks).transpose().fillna(0)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...21222324252627282930
(8.00.04.01.04.04.02.04.011.04.0...2.03.02.06.04.01.05.03.02.01.0
)8.00.04.01.04.04.02.04.011.04.0...2.03.02.05.05.01.05.03.02.01.0
,49.036.044.060.043.039.055.042.020.055.0...38.039.040.045.058.038.032.041.042.038.0
-8.05.02.05.04.03.03.08.03.06.0...4.010.09.01.02.01.05.020.03.00.0
.22.038.025.027.025.025.037.024.032.036.0...27.046.032.041.017.038.036.031.041.040.0
..................................................................
verdighet0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
virkelige0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
Ørkenen0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
årer0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
ødela0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
\n", "

6844 rows × 31 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... \\\n", "( 8.0 0.0 4.0 1.0 4.0 4.0 2.0 4.0 11.0 4.0 ... \n", ") 8.0 0.0 4.0 1.0 4.0 4.0 2.0 4.0 11.0 4.0 ... \n", ", 49.0 36.0 44.0 60.0 43.0 39.0 55.0 42.0 20.0 55.0 ... \n", "- 8.0 5.0 2.0 5.0 4.0 3.0 3.0 8.0 3.0 6.0 ... \n", ". 22.0 38.0 25.0 27.0 25.0 25.0 37.0 24.0 32.0 36.0 ... \n", "... ... ... ... ... ... ... ... ... ... ... ... \n", "verdighet 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "virkelige 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "Ørkenen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "årer 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "ødela 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "\n", " 21 22 23 24 25 26 27 28 29 30 \n", "( 2.0 3.0 2.0 6.0 4.0 1.0 5.0 3.0 2.0 1.0 \n", ") 2.0 3.0 2.0 5.0 5.0 1.0 5.0 3.0 2.0 1.0 \n", ", 38.0 39.0 40.0 45.0 58.0 38.0 32.0 41.0 42.0 38.0 \n", "- 4.0 10.0 9.0 1.0 2.0 1.0 5.0 20.0 3.0 0.0 \n", ". 27.0 46.0 32.0 41.0 17.0 38.0 36.0 31.0 41.0 40.0 \n", "... ... ... ... ... ... ... ... ... ... ... 
\n", "verdighet 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", "virkelige 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", "Ørkenen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", "årer 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", "ødela 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", "\n", "[6844 rows x 31 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Import top 50000 tokens from NB collection\n", "tot = dh.totals(50000)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Relative frequency: token counts in this book divided by tot.freq.\n", "# Bound to a new name so the Chunks object in `res` is not overwritten\n", "# (earlier cells read res.chunks and must stay re-runnable).\n", "relative_freq = df.sum(axis=1) / tot.freq" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Keep the 1000 highest ratios; tokens missing from tot give NaN and are dropped.\n", "target_tokens = relative_freq.sort_values(ascending=False).iloc[:1000].dropna().index" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...21222324252627282930
Forstå0.03.03.00.02.00.00.00.00.06.0...3.03.01.00.01.04.00.01.00.00.0
FØR3.02.03.03.04.02.02.02.02.03.0...4.02.02.02.03.03.01.03.01.01.0
vikingene0.00.00.00.00.00.00.00.00.00.0...0.00.00.02.00.00.00.00.00.00.0
læreboka0.00.01.00.01.01.01.00.01.00.0...2.00.00.00.00.00.00.01.00.00.0
18504.02.03.03.04.02.02.02.02.03.0...4.04.02.02.03.03.01.04.01.01.0
..................................................................
fortrinn0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
hustruen0.00.00.00.00.00.00.00.00.01.0...0.00.00.00.00.00.00.00.00.00.0
liga0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
skitt0.00.00.00.00.00.00.00.00.00.0...0.00.01.00.00.00.00.00.00.00.0
van-0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

1000 rows × 31 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 21 22 \\\n", "Forstå 0.0 3.0 3.0 0.0 2.0 0.0 0.0 0.0 0.0 6.0 ... 3.0 3.0 \n", "FØR 3.0 2.0 3.0 3.0 4.0 2.0 2.0 2.0 2.0 3.0 ... 4.0 2.0 \n", "vikingene 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "læreboka 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 1.0 0.0 ... 2.0 0.0 \n", "1850 4.0 2.0 3.0 3.0 4.0 2.0 2.0 2.0 2.0 3.0 ... 4.0 4.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... ... \n", "fortrinn 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "hustruen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 \n", "liga 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "skitt 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "van- 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "\n", " 23 24 25 26 27 28 29 30 \n", "Forstå 1.0 0.0 1.0 4.0 0.0 1.0 0.0 0.0 \n", "FØR 2.0 2.0 3.0 3.0 1.0 3.0 1.0 1.0 \n", "vikingene 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "læreboka 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 \n", "1850 2.0 2.0 3.0 3.0 1.0 4.0 1.0 1.0 \n", "... ... ... ... ... ... ... ... ... 
\n", "fortrinn 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "hustruen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "liga 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "skitt 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "van- 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[1000 rows x 31 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[target_tokens]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Select the target rows once instead of re-indexing df on every iteration.\n", "target_counts = df.loc[target_tokens]\n", "\n", "# One document per chunk column: each token repeated by its count in that chunk.\n", "outer_lst = [\n", "    \"\".join(\n", "        (token + \" \") * int(count)\n", "        for token, count in target_counts[col].items()\n", "        if count > 0\n", "    )\n", "    for col in target_counts.columns\n", "]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "31" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(outer_lst)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Prep for LDA" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "data = [x.split() for x in outer_lst]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "id2word = gensim.corpora.Dictionary(data)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "corpus = [id2word.doc2bow(chunk) for chunk in data]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Make model" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "lda_model = gensim.models.LdaMulticore(\n", "    corpus=corpus,\n", "    id2word=id2word,\n", "    num_topics=10,\n", "    # Fixed seed so topics are comparable between runs; worker scheduling\n", "    # may still cause small differences (see the gensim docs).\n", "    random_state=42,\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0,\n", " '0.022*\"brenning\" + 0.015*\"1650\" + 0.014*\"faget\" + 0.014*\"1850\" + '\n", " 
'0.014*\"Hva\" + 0.014*\"FØR\" + 0.011*\"målene\" + 0.011*\"EUROPA\" + 0.010*\"NYE\" + '\n", " '0.010*\"hersker\"'),\n", " (1,\n", " '0.022*\"Hva\" + 0.017*\"1850\" + 0.013*\"garden\" + 0.012*\"Kilde\" + '\n", " '0.011*\"revolusjon\" + 0.011*\"sådde\" + 0.010*\"slags\" + 0.009*\"industrielle\" + '\n", " '0.009*\"skipet\" + 0.008*\"FØR\"'),\n", " (2,\n", " '0.020*\"1850\" + 0.015*\"Hva\" + 0.015*\"FØR\" + 0.014*\"garden\" + '\n", " '0.012*\"Kapittel\" + 0.011*\"Kilde\" + 0.010*\"sådde\" + 0.009*\"Forstå\" + '\n", " '0.009*\"slags\" + 0.008*\"1814\"'),\n", " (3,\n", " '0.013*\"faget\" + 0.009*\"målene\" + 0.007*\"Vurdering\" + 0.006*\"kjennskap\" + '\n", " '0.006*\"menneskene\" + 0.006*\"middels\" + 0.005*\"Lag\" + 0.005*\"læreplanen\" + '\n", " '0.005*\"læreren\" + 0.005*\"informere\"'),\n", " (4,\n", " '0.032*\"Hva\" + 0.025*\"Hvordan\" + 0.023*\"1850\" + 0.020*\"FØR\" + '\n", " '0.016*\"Oppgave\" + 0.016*\"Forstå\" + 0.015*\"Spørsmål\" + 0.014*\"slags\" + '\n", " '0.010*\"Hvilke\" + 0.010*\"perioden\"'),\n", " (5,\n", " '0.031*\"Hva\" + 0.014*\"1850\" + 0.014*\"slags\" + 0.014*\"Hvordan\" + '\n", " '0.014*\"Spørsmål\" + 0.012*\"FØR\" + 0.012*\"skipet\" + 0.010*\"Hvilke\" + '\n", " '0.010*\"Kilde\" + 0.009*\"vikingene\"'),\n", " (6,\n", " '0.022*\"Hvordan\" + 0.021*\"Hva\" + 0.020*\"FØR\" + 0.019*\"1850\" + 0.012*\"1814\" + '\n", " '0.011*\"Guri\" + 0.011*\"Sønsteby\" + 0.010*\"Kilde\" + 0.010*\"Spørsmål\" + '\n", " '0.010*\"slags\"'),\n", " (7,\n", " '0.042*\"Kapittel\" + 0.032*\"Hva\" + 0.022*\"1850\" + 0.012*\"FØR\" + '\n", " '0.012*\"samfunnet\" + 0.012*\"revolusjon\" + 0.011*\"Hvordan\" + 0.010*\"slags\" + '\n", " '0.010*\"læreplanen\" + 0.010*\"faget\"'),\n", " (8,\n", " '0.045*\"Hva\" + 0.028*\"1850\" + 0.026*\"FØR\" + 0.019*\"Hvordan\" + '\n", " '0.019*\"Spørsmål\" + 0.017*\"Kilde\" + 0.016*\"slags\" + 0.016*\"Oppgave\" + '\n", " '0.013*\"Forstå\" + 0.010*\"1814\"'),\n", " (9,\n", " '0.027*\"Danmarks\" + 0.020*\"Kilde\" + 0.013*\"Hva\" + 0.012*\"Hvordan\" + 
'\n", " '0.011*\"Spørsmål\" + 0.011*\"FØR\" + 0.010*\"rikes\" + 0.010*\"udi\" + 0.009*\"1850\" '\n", " '+ 0.009*\"raad\"')]\n" ] } ], "source": [ "pprint(lda_model.print_topics())" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "prep = genvis.prepare(lda_model, corpus, id2word)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Save to html\n", "# pyLDAvis.save_html(prep, \"result.html\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "pyLDAvis.enable_notebook()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyLDAvis.display(prep)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "vscode": { "interpreter": { "hash": "1d1df0d064732dbd8ae09ceab87be2790d7eed279040ea9405054873f855fb6c" } } }, "nbformat": 4, "nbformat_minor": 2 }