{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Topic modelling med gensim og dhlab" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import dhlab as dh\n", "import pandas as pd\n", "import gensim\n", "import pyLDAvis\n", "from pprint import pprint\n", "import pyLDAvis.gensim_models as genvis" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# sprit urn URN:NBN:no-nb_digibok_2020090207537" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finn en bok" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "bok = dh.Corpus(doctype=\"digibok\", limit=1, title=\"Norgeshistorie\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | dhlabid | \n", "urn | \n", "title | \n", "authors | \n", "oaiid | \n", "sesamid | \n", "isbn10 | \n", "city | \n", "timestamp | \n", "year | \n", "publisher | \n", "langs | \n", "subjects | \n", "ddc | \n", "genres | \n", "literaryform | \n", "doctype | \n", "ocr_creator | \n", "ocr_timestamp | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "100441759 | \n", "URN:NBN:no-nb_digibok_2009111200018 | \n", "Elevens ressursbok : verdens- og Norgeshistori... | \n", "Berger , Finn / Henningsen , Rune / Aass , Ole | \n", "oai:nb.bibsys.no:999721559964702202 | \n", "9d569a267470cda9851b718c9bdd20f0 | \n", "8200423387 | \n", "\n", " | 19970101 | \n", "1997 | \n", "Universitetsforl. | \n", "nob | \n", "Eldre tid / Tidlig nytid / Historie / Verden /... | \n", "\n", " | \n", " | Faglitteratur | \n", "digibok | \n", "nb | \n", "20060101 | \n", "
\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "... | \n", "21 | \n", "22 | \n", "23 | \n", "24 | \n", "25 | \n", "26 | \n", "27 | \n", "28 | \n", "29 | \n", "30 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
( | \n", "8.0 | \n", "0.0 | \n", "4.0 | \n", "1.0 | \n", "4.0 | \n", "4.0 | \n", "2.0 | \n", "4.0 | \n", "11.0 | \n", "4.0 | \n", "... | \n", "2.0 | \n", "3.0 | \n", "2.0 | \n", "6.0 | \n", "4.0 | \n", "1.0 | \n", "5.0 | \n", "3.0 | \n", "2.0 | \n", "1.0 | \n", "
) | \n", "8.0 | \n", "0.0 | \n", "4.0 | \n", "1.0 | \n", "4.0 | \n", "4.0 | \n", "2.0 | \n", "4.0 | \n", "11.0 | \n", "4.0 | \n", "... | \n", "2.0 | \n", "3.0 | \n", "2.0 | \n", "5.0 | \n", "5.0 | \n", "1.0 | \n", "5.0 | \n", "3.0 | \n", "2.0 | \n", "1.0 | \n", "
, | \n", "49.0 | \n", "36.0 | \n", "44.0 | \n", "60.0 | \n", "43.0 | \n", "39.0 | \n", "55.0 | \n", "42.0 | \n", "20.0 | \n", "55.0 | \n", "... | \n", "38.0 | \n", "39.0 | \n", "40.0 | \n", "45.0 | \n", "58.0 | \n", "38.0 | \n", "32.0 | \n", "41.0 | \n", "42.0 | \n", "38.0 | \n", "
- | \n", "8.0 | \n", "5.0 | \n", "2.0 | \n", "5.0 | \n", "4.0 | \n", "3.0 | \n", "3.0 | \n", "8.0 | \n", "3.0 | \n", "6.0 | \n", "... | \n", "4.0 | \n", "10.0 | \n", "9.0 | \n", "1.0 | \n", "2.0 | \n", "1.0 | \n", "5.0 | \n", "20.0 | \n", "3.0 | \n", "0.0 | \n", "
. | \n", "22.0 | \n", "38.0 | \n", "25.0 | \n", "27.0 | \n", "25.0 | \n", "25.0 | \n", "37.0 | \n", "24.0 | \n", "32.0 | \n", "36.0 | \n", "... | \n", "27.0 | \n", "46.0 | \n", "32.0 | \n", "41.0 | \n", "17.0 | \n", "38.0 | \n", "36.0 | \n", "31.0 | \n", "41.0 | \n", "40.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
verdighet | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
virkelige | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
Ørkenen | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
årer | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
ødela | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
6844 rows × 31 columns
\n", "\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "... | \n", "21 | \n", "22 | \n", "23 | \n", "24 | \n", "25 | \n", "26 | \n", "27 | \n", "28 | \n", "29 | \n", "30 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Forstå | \n", "0.0 | \n", "3.0 | \n", "3.0 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "6.0 | \n", "... | \n", "3.0 | \n", "3.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "4.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "
FØR | \n", "3.0 | \n", "2.0 | \n", "3.0 | \n", "3.0 | \n", "4.0 | \n", "2.0 | \n", "2.0 | \n", "2.0 | \n", "2.0 | \n", "3.0 | \n", "... | \n", "4.0 | \n", "2.0 | \n", "2.0 | \n", "2.0 | \n", "3.0 | \n", "3.0 | \n", "1.0 | \n", "3.0 | \n", "1.0 | \n", "1.0 | \n", "
vikingene | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
læreboka | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "2.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "
1850 | \n", "4.0 | \n", "2.0 | \n", "3.0 | \n", "3.0 | \n", "4.0 | \n", "2.0 | \n", "2.0 | \n", "2.0 | \n", "2.0 | \n", "3.0 | \n", "... | \n", "4.0 | \n", "4.0 | \n", "2.0 | \n", "2.0 | \n", "3.0 | \n", "3.0 | \n", "1.0 | \n", "4.0 | \n", "1.0 | \n", "1.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
fortrinn | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
hustruen | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
liga | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
skitt | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
van- | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1000 rows × 31 columns
\n", "