diff --git a/Makefile b/Makefile
index d782b19ed3..29fe984285 100644
--- a/Makefile
+++ b/Makefile
@@ -29,6 +29,10 @@ filedurations:
 learningactivities:
 	python contentcuration/manage.py set_default_learning_activities
 
+set-tsvectors:
+	python contentcuration/manage.py set_channel_tsvectors
+	python contentcuration/manage.py set_contentnode_tsvectors
+
 ###############################################################
 # END PRODUCTION COMMANDS #####################################
 ###############################################################
diff --git a/contentcuration/contentcuration/settings.py b/contentcuration/contentcuration/settings.py
index 8284174264..bd857d651b 100644
--- a/contentcuration/contentcuration/settings.py
+++ b/contentcuration/contentcuration/settings.py
@@ -85,6 +85,7 @@
     'webpack_loader',
     'django_filters',
     'mathfilters',
+    'django.contrib.postgres',
     'django_celery_results',
 )
 
@@ -220,7 +221,6 @@
 
 IS_CONTENTNODE_TABLE_PARTITIONED = os.getenv("IS_CONTENTNODE_TABLE_PARTITIONED") or False
 
-
 DATABASE_ROUTERS = [
     "kolibri_content.router.ContentDBRouter",
 ]
diff --git a/contentcuration/contentcuration/tests/utils/test_cache.py b/contentcuration/contentcuration/tests/utils/test_cache.py
index 1bbf69d580..5327da19ac 100644
--- a/contentcuration/contentcuration/tests/utils/test_cache.py
+++ b/contentcuration/contentcuration/tests/utils/test_cache.py
@@ -1,12 +1,12 @@
 import mock
-from django.test import SimpleTestCase
+from django.test import TestCase
 
 from ..helpers import mock_class_instance
 from contentcuration.models import ContentNode
 from contentcuration.utils.cache import ResourceSizeCache
 
 
-class ResourceSizeCacheTestCase(SimpleTestCase):
+class ResourceSizeCacheTestCase(TestCase):
     def setUp(self):
         super(ResourceSizeCacheTestCase, self).setUp()
         self.node = mock.Mock(spec_set=ContentNode())
diff --git a/contentcuration/contentcuration/tests/utils/test_nodes.py b/contentcuration/contentcuration/tests/utils/test_nodes.py
index 3b96c30a3c..83171288d6 100644
--- a/contentcuration/contentcuration/tests/utils/test_nodes.py
+++ b/contentcuration/contentcuration/tests/utils/test_nodes.py
@@ -6,7 +6,7 @@
 from dateutil.parser import isoparse
 from django.db.models import F
 from django.db.models import Max
-from django.test import SimpleTestCase
+from django.test import TestCase
 
 from ..base import StudioTestCase
 from contentcuration.models import ContentNode
@@ -42,7 +42,7 @@ def test_modified_since(self):
 
 @mock.patch("contentcuration.utils.nodes.ResourceSizeHelper")
 @mock.patch("contentcuration.utils.nodes.ResourceSizeCache")
-class CalculateResourceSizeTestCase(SimpleTestCase):
+class CalculateResourceSizeTestCase(TestCase):
     def setUp(self):
         super(CalculateResourceSizeTestCase, self).setUp()
         self.node = mock.Mock(spec_set=ContentNode())
diff --git a/contentcuration/search/constants.py b/contentcuration/search/constants.py
new file mode 100644
index 0000000000..1ac316c3ae
--- /dev/null
+++ b/contentcuration/search/constants.py
@@ -0,0 +1,32 @@
+from django.contrib.postgres.search import SearchVector
+
+# Full text search configuration passed to the search vectors defined in this
+# module. "simple" is used to keep search language agnostic.
+POSTGRES_FTS_CONFIG = "simple"
+
+# ContentNode: the fields combined into the keywords vector.
+CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS = (
+    "id",
+    "channel_id",
+    "node_id",
+    "content_id",
+    "tree_id",
+    "title",
+    "description",
+    "contentnode_tags",
+)
+CONTENTNODE_KEYWORDS_TSVECTOR = SearchVector(
+    *CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS,
+    config=POSTGRES_FTS_CONFIG
+)
+
+# ContentNode: the fields combined into the author vector.
+CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = ("author", "aggregator", "provider")
+CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(
+    *CONTENTNODE_AUTHOR_TSVECTOR_FIELDS,
+    config=POSTGRES_FTS_CONFIG
+)
+
+# Channel vector and search fields.
+CHANNEL_KEYWORDS_TSVECTOR_FIELDS = ("id", "main_tree__tree_id", "name", "description", "tagline", "primary_channel_token")
+CHANNEL_KEYWORDS_TSVECTOR = SearchVector(*CHANNEL_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)
diff --git a/contentcuration/search/management/__init__.py b/contentcuration/search/management/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/contentcuration/search/management/commands/__init__.py b/contentcuration/search/management/commands/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/contentcuration/search/management/commands/set_channel_tsvectors.py b/contentcuration/search/management/commands/set_channel_tsvectors.py
new file mode 100644
index 0000000000..d82f4848f5
--- /dev/null
+++ b/contentcuration/search/management/commands/set_channel_tsvectors.py
@@ -0,0 +1,61 @@
+"""
+Management command that bulk-inserts channel tsvectors into the
+ChannelFullTextSearch table.
+"""
+import logging as logmodule
+import time
+
+from django.core.management.base import BaseCommand
+from django.db.models import Exists
+from django.db.models import OuterRef
+from search.constants import CHANNEL_KEYWORDS_TSVECTOR
+from search.models import ChannelFullTextSearch
+
+from contentcuration.models import Channel
+from contentcuration.viewsets.channel import primary_token_subquery
+
+
+logmodule.basicConfig(level=logmodule.INFO)
+logging = logmodule.getLogger("command")
+
+# Upper bound on the number of channels fetched and inserted per pass.
+CHUNKSIZE = 5000
+
+
+class Command(BaseCommand):
+    """Create a ChannelFullTextSearch row for each eligible channel that lacks one."""
+
+    def handle(self, *args, **options):
+        """
+        Insert the missing channel tsvectors in chunks of at most CHUNKSIZE rows.
+
+        Selected channels are not deleted, have a published main tree and have no
+        ChannelFullTextSearch row yet. A channel stops matching once its row is
+        inserted, so re-running the same sliced query returns the next chunk.
+        """
+        started_at = time.time()
+
+        has_fts_row = Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id")))
+        pending_channels = Channel.objects.filter(~has_fts_row, deleted=False, main_tree__published=True)
+        pending_channels = pending_channels.annotate(
+            primary_channel_token=primary_token_subquery,
+            keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR,
+        )
+        pending_channels = pending_channels.values("id", "keywords_tsvector")
+
+        total_inserted = 0
+        while True:
+            chunk = list(pending_channels[:CHUNKSIZE])
+            if not chunk:
+                break
+
+            logging.info("Inserting channel tsvectors.")
+            created = ChannelFullTextSearch.objects.bulk_create([
+                ChannelFullTextSearch(channel_id=row["id"], keywords_tsvector=row["keywords_tsvector"])
+                for row in chunk
+            ])
+            total_inserted += len(created)
+            logging.info("Inserted {} channel tsvectors.".format(len(created)))
+
+        logging.info("Completed! successfully inserted total of {} channel tsvectors in {} seconds.".format(
+            total_inserted, time.time() - started_at))
diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py
new file mode 100644
index 0000000000..4e5673d9ec
--- /dev/null
+++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py
@@ -0,0 +1,59 @@
+"""
+This command inserts in bulk contentnode tsvectors to the ContentNodeFullTextSearch table.
+"""
+import logging as logmodule
+import time
+
+from django.contrib.postgres.aggregates import StringAgg
+from django.core.management.base import BaseCommand
+from django.db.models import Exists
+from django.db.models import OuterRef
+from search.constants import CONTENTNODE_AUTHOR_TSVECTOR
+from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR
+from search.models import ContentNodeFullTextSearch
+
+from contentcuration.models import ContentNode
+
+
+logmodule.basicConfig(level=logmodule.INFO)
+logging = logmodule.getLogger("command")
+
+# Upper bound on the number of contentnodes fetched and inserted per pass.
+CHUNKSIZE = 10000
+
+
+class Command(BaseCommand):
+    """Create a ContentNodeFullTextSearch row for each eligible contentnode that lacks one."""
+
+    def handle(self, *args, **options):
+        """Insert the missing contentnode tsvectors in chunks of at most CHUNKSIZE rows."""
+        started_at = time.time()
+
+        has_fts_row = Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id")))
+        pending_nodes = ContentNode._annotate_channel_id(ContentNode.objects).annotate(
+            contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
+            keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
+            author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR,
+        )
+        pending_nodes = pending_nodes.filter(~has_fts_row, published=True, channel_id__isnull=False)
+        pending_nodes = pending_nodes.values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by()
+
+        total_inserted = 0
+        while True:
+            # Nodes inserted by the previous pass no longer match, so this is the next chunk.
+            chunk = list(pending_nodes[:CHUNKSIZE])
+            if not chunk:
+                break
+
+            logging.info("Inserting contentnode tsvectors.")
+            created = ContentNodeFullTextSearch.objects.bulk_create([
+                ContentNodeFullTextSearch(
+                    contentnode_id=row["id"], channel_id=row["channel_id"],
+                    keywords_tsvector=row["keywords_tsvector"], author_tsvector=row["author_tsvector"],
+                )
+                for row in chunk
+            ])
+            total_inserted += len(created)
+            logging.info("Inserted {} contentnode tsvectors.".format(len(created)))
+
+        logging.info("Completed! Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(total_inserted, time.time() - started_at))
diff --git a/contentcuration/search/migrations/0003_fulltextsearch.py b/contentcuration/search/migrations/0003_fulltextsearch.py
new file mode 100644
index 0000000000..632df6a39e
--- /dev/null
+++ b/contentcuration/search/migrations/0003_fulltextsearch.py
@@ -0,0 +1,54 @@
+# Generated by Django 3.2.14 on 2022-09-16 08:55
+import uuid
+
+import django.contrib.postgres.indexes
+import django.contrib.postgres.search
+import django.db.models.deletion
+from django.contrib.postgres.operations import AddIndexConcurrently
+from django.db import migrations
+from django.db import models
+
+import contentcuration.models
+
+
+class Migration(migrations.Migration):
+
+    atomic = False
+
+    dependencies = [
+        ('contentcuration', '0140_delete_task'),
+        ('search', '0002_auto_20201215_2110'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='ContentNodeFullTextSearch',
+            fields=[
+                ('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)),
+                ('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)),
+                ('author_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)),
+                ('channel', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='channel_nodes_fts', to='contentcuration.channel')),
+                ('contentnode', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='node_fts', to='contentcuration.contentnode')),
+            ],
+        ),
+        migrations.CreateModel(
+            name='ChannelFullTextSearch',
+            fields=[
+                ('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)),
+                ('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)),
+                ('channel',
models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='channel_fts', to='contentcuration.channel')),
+            ],
+        ),
+        AddIndexConcurrently(
+            model_name='contentnodefulltextsearch',
+            index=django.contrib.postgres.indexes.GinIndex(fields=['keywords_tsvector'], name='node_keywords_tsv__gin_idx'),
+        ),
+        AddIndexConcurrently(
+            model_name='contentnodefulltextsearch',
+            index=django.contrib.postgres.indexes.GinIndex(fields=['author_tsvector'], name='node_author_tsv__gin_idx'),
+        ),
+        AddIndexConcurrently(
+            model_name='channelfulltextsearch',
+            index=django.contrib.postgres.indexes.GinIndex(fields=['keywords_tsvector'], name='channel_keywords_tsv__gin_idx'),
+        ),
+    ]
diff --git a/contentcuration/search/models.py b/contentcuration/search/models.py
index e1e550576b..9e121af509 100644
--- a/contentcuration/search/models.py
+++ b/contentcuration/search/models.py
@@ -1,8 +1,14 @@
 import uuid
 
 from django.conf import settings
+from django.contrib.postgres.indexes import GinIndex
+from django.contrib.postgres.search import SearchVectorField
 from django.db import models
 
+from contentcuration.models import Channel
+from contentcuration.models import ContentNode
+from contentcuration.models import UUIDField as StudioUUIDField
+
 
 class SavedSearch(models.Model):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
@@ -13,3 +19,57 @@ class SavedSearch(models.Model):
     saved_by = models.ForeignKey(
         settings.AUTH_USER_MODEL, related_name="searches", on_delete=models.CASCADE
     )
+
+
+class ContentNodeFullTextSearch(models.Model):
+    """Full text search vectors stored for a single ContentNode."""
+
+    id = StudioUUIDField(primary_key=True, default=uuid.uuid4)
+
+    # The contentnode that this record points to.
+    contentnode = models.OneToOneField(
+        ContentNode,
+        on_delete=models.CASCADE,
+        related_name="node_fts",
+    )
+
+    # The channel to which the contentnode belongs. Channel cannot be NULL
+    # because we only allow searches to be made inside channels.
+    channel = models.ForeignKey(
+        Channel,
+        on_delete=models.CASCADE,
+        related_name="channel_nodes_fts",
+    )
+
+    # The keywords, stored as a tsvector.
+    keywords_tsvector = SearchVectorField(null=True, blank=True)
+
+    # The author, stored as a tsvector.
+    author_tsvector = SearchVectorField(null=True, blank=True)
+
+    class Meta:
+        indexes = [
+            GinIndex(fields=["keywords_tsvector"], name="node_keywords_tsv__gin_idx"),
+            GinIndex(fields=["author_tsvector"], name="node_author_tsv__gin_idx"),
+        ]
+
+
+class ChannelFullTextSearch(models.Model):
+    """Full text search vector stored for a single Channel."""
+
+    id = StudioUUIDField(primary_key=True, default=uuid.uuid4)
+
+    # The channel to which this record points.
+    channel = models.OneToOneField(
+        Channel,
+        on_delete=models.CASCADE,
+        related_name="channel_fts",
+    )
+
+    # The channel keywords, stored as a tsvector for super fast searches.
+    keywords_tsvector = SearchVectorField(null=True, blank=True)
+
+    class Meta:
+        indexes = [
+            GinIndex(fields=["keywords_tsvector"], name="channel_keywords_tsv__gin_idx"),
+        ]