learningequality · bjester · Sep 16, 2022 · Sep 16, 2022
diff --git a/Makefile b/Makefile
@@ -29,6 +29,10 @@ filedurations:
 learningactivities:
 	python contentcuration/manage.py set_default_learning_activities
 
+set-tsvectors:  # Run both tsvector management commands (channels first, then contentnodes)
+	python contentcuration/manage.py set_channel_tsvectors
+	python contentcuration/manage.py set_contentnode_tsvectors
+
 ###############################################################
 # END PRODUCTION COMMANDS #####################################
 ###############################################################

diff --git a/contentcuration/contentcuration/settings.py b/contentcuration/contentcuration/settings.py
@@ -85,6 +85,7 @@
     'webpack_loader',
     'django_filters',
     'mathfilters',
+    'django.contrib.postgres',
     'django_celery_results',
 )
 
@@ -220,7 +221,6 @@
 
 IS_CONTENTNODE_TABLE_PARTITIONED = os.getenv("IS_CONTENTNODE_TABLE_PARTITIONED") or False
 
-
 DATABASE_ROUTERS = [
     "kolibri_content.router.ContentDBRouter",
 ]

diff --git a/contentcuration/contentcuration/tests/utils/test_cache.py b/contentcuration/contentcuration/tests/utils/test_cache.py
@@ -1,12 +1,12 @@
 import mock
-from django.test import SimpleTestCase
+from django.test import TestCase
 
 from ..helpers import mock_class_instance
 from contentcuration.models import ContentNode
 from contentcuration.utils.cache import ResourceSizeCache
 
 
-class ResourceSizeCacheTestCase(SimpleTestCase):
+class ResourceSizeCacheTestCase(TestCase):
     def setUp(self):
         super(ResourceSizeCacheTestCase, self).setUp()
         self.node = mock.Mock(spec_set=ContentNode())

diff --git a/contentcuration/contentcuration/tests/utils/test_nodes.py b/contentcuration/contentcuration/tests/utils/test_nodes.py
@@ -6,7 +6,7 @@
 from dateutil.parser import isoparse
 from django.db.models import F
 from django.db.models import Max
-from django.test import SimpleTestCase
+from django.test import TestCase
 
 from ..base import StudioTestCase
 from contentcuration.models import ContentNode
@@ -42,7 +42,7 @@ def test_modified_since(self):
 
 @mock.patch("contentcuration.utils.nodes.ResourceSizeHelper")
 @mock.patch("contentcuration.utils.nodes.ResourceSizeCache")
-class CalculateResourceSizeTestCase(SimpleTestCase):
+class CalculateResourceSizeTestCase(TestCase):
     def setUp(self):
         super(CalculateResourceSizeTestCase, self).setUp()
         self.node = mock.Mock(spec_set=ContentNode())

diff --git a/contentcuration/search/constants.py b/contentcuration/search/constants.py
@@ -0,0 +1,16 @@
+from django.contrib.postgres.search import SearchVector
+
+# Postgres full text search configuration, passed as `config` to every SearchVector
+# below. We use "simple" to make search language agnostic.
+POSTGRES_FTS_CONFIG = "simple"
+
+# ContentNode vectors and search fields. Callers must annotate "contentnode_tags" (and presumably "channel_id").
+CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS = ("id", "channel_id", "node_id", "content_id", "tree_id", "title", "description", "contentnode_tags")
+CONTENTNODE_KEYWORDS_TSVECTOR = SearchVector(*CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)
+
+CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = ("author", "aggregator", "provider")
+CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)
+
+# Channel vector and search fields. Callers must annotate "primary_channel_token" onto the queryset.
+CHANNEL_KEYWORDS_TSVECTOR_FIELDS = ("id", "main_tree__tree_id", "name", "description", "tagline", "primary_channel_token")
+CHANNEL_KEYWORDS_TSVECTOR = SearchVector(*CHANNEL_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)
diff --git a/contentcuration/search/management/__init__.py b/contentcuration/search/management/__init__.py
diff --git a/contentcuration/search/management/commands/__init__.py b/contentcuration/search/management/commands/__init__.py
diff --git a/contentcuration/search/management/commands/set_channel_tsvectors.py b/contentcuration/search/management/commands/set_channel_tsvectors.py
@@ -0,0 +1,57 @@
+"""
+This command bulk-inserts channel tsvectors into the ChannelFullTextSearch table.
+"""
+import logging as logmodule
+import time
+
+from django.core.management.base import BaseCommand
+from django.db.models import Exists
+from django.db.models import OuterRef
+from search.constants import CHANNEL_KEYWORDS_TSVECTOR
+from search.models import ChannelFullTextSearch
+
+from contentcuration.models import Channel
+from contentcuration.viewsets.channel import primary_token_subquery
+
+
+logmodule.basicConfig(level=logmodule.INFO)
+logging = logmodule.getLogger("command")  # a Logger instance, not the stdlib module (that is `logmodule`)
+
+CHUNKSIZE = 5000  # maximum number of channels fetched and bulk-inserted per loop iteration
+
+
+class Command(BaseCommand):
+    """Insert ChannelFullTextSearch rows, in CHUNKSIZE batches, for published, non-deleted channels lacking one."""
+    def handle(self, *args, **options):
+        start = time.time()
+        # Skip channels that already have a ChannelFullTextSearch row, so re-running only fills gaps.
+        channel_not_already_inserted_query = ~Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id")))
+        # primary_channel_token is annotated because it is one of the CHANNEL_KEYWORDS_TSVECTOR fields.
+        channel_query = (Channel.objects.filter(channel_not_already_inserted_query,
+                                                deleted=False, main_tree__published=True)
+                         .annotate(primary_channel_token=primary_token_subquery,
+                                   keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR)
+                         .values("id", "keywords_tsvector"))
+
+        insertable_channels = list(channel_query[:CHUNKSIZE])
+        total_channel_tsvectors_inserted = 0
+
+        while insertable_channels:
+            logging.info("Inserting channel tsvectors.")
+
+            insert_objs = list()
+            for channel in insertable_channels:
+                obj = ChannelFullTextSearch(channel_id=channel["id"], keywords_tsvector=channel["keywords_tsvector"])
+                insert_objs.append(obj)
+
+            inserted_objs_list = ChannelFullTextSearch.objects.bulk_create(insert_objs)
+
+            current_inserts_count = len(inserted_objs_list)
+            total_channel_tsvectors_inserted = total_channel_tsvectors_inserted + current_inserts_count
+
+            logging.info("Inserted {} channel tsvectors.".format(current_inserts_count))
+            # Slicing the lazy queryset again re-runs it; rows inserted above now fail the NOT EXISTS filter.
+            insertable_channels = list(channel_query[:CHUNKSIZE])
+
+        logging.info("Completed! successfully inserted total of {} channel tsvectors in {} seconds.".format(
+            total_channel_tsvectors_inserted, time.time() - start))
diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py
@@ -0,0 +1,59 @@
+"""
+This command bulk-inserts contentnode tsvectors into the ContentNodeFullTextSearch table.
+"""
+import logging as logmodule
+import time
+
+from django.contrib.postgres.aggregates import StringAgg
+from django.core.management.base import BaseCommand
+from django.db.models import Exists
+from django.db.models import OuterRef
+from search.constants import CONTENTNODE_AUTHOR_TSVECTOR
+from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR
+from search.models import ContentNodeFullTextSearch
+
+from contentcuration.models import ContentNode
+
+
+logmodule.basicConfig(level=logmodule.INFO)
+logging = logmodule.getLogger("command")  # a Logger instance, not the stdlib module (that is `logmodule`)
+
+CHUNKSIZE = 10000  # maximum number of contentnodes fetched and bulk-inserted per loop iteration
+
+
+class Command(BaseCommand):
+    """Insert ContentNodeFullTextSearch rows, in CHUNKSIZE batches, for published nodes that lack one."""
+    def handle(self, *args, **options):
+        start = time.time()
+        # Skip nodes that already have a ContentNodeFullTextSearch row, so re-running only fills gaps.
+        tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id")))
+        # contentnode_tags (space-joined tag names) feeds the keywords vector; order_by() clears any ordering.
+        tsvector_node_query = (ContentNode._annotate_channel_id(ContentNode.objects)
+                               .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
+                                         keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
+                                         author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR)
+                               .filter(tsvector_not_already_inserted_query, published=True, channel_id__isnull=False)
+                               .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by())
+
+        insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE])
+        total_tsvectors_inserted = 0
+
+        while insertable_nodes_tsvector:
+            logging.info("Inserting contentnode tsvectors.")
+
+            insert_objs = list()
+            for node in insertable_nodes_tsvector:
+                obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"],
+                                                keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"])
+                insert_objs.append(obj)
+
+            inserted_objs_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs)
+
+            current_inserts_count = len(inserted_objs_list)
+            total_tsvectors_inserted = total_tsvectors_inserted + current_inserts_count
+
+            logging.info("Inserted {} contentnode tsvectors.".format(current_inserts_count))
+            # Slicing the lazy queryset again re-runs it; rows inserted above now fail the NOT EXISTS filter.
+            insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE])
+
+        logging.info("Completed! Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(total_tsvectors_inserted, time.time() - start))
diff --git a/contentcuration/search/migrations/0003_fulltextsearch.py b/contentcuration/search/migrations/0003_fulltextsearch.py
@@ -0,0 +1,54 @@
+# Generated by Django 3.2.14 on 2022-09-16 08:55
+import uuid
+
+import django.contrib.postgres.indexes
+import django.contrib.postgres.search
+import django.db.models.deletion
+from django.contrib.postgres.operations import AddIndexConcurrently
+from django.db import migrations
+from django.db import models
+
+import contentcuration.models
+
+
+class Migration(migrations.Migration):
+    """Create the ContentNodeFullTextSearch and ChannelFullTextSearch tables and their GIN tsvector indexes."""
+    atomic = False  # required: AddIndexConcurrently cannot run inside a transaction
+
+    dependencies = [
+        ('contentcuration', '0140_delete_task'),
+        ('search', '0002_auto_20201215_2110'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='ContentNodeFullTextSearch',
+            fields=[
+                ('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)),
+                ('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)),
+                ('author_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)),
+                ('channel', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='channel_nodes_fts', to='contentcuration.channel')),
+                ('contentnode', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='node_fts', to='contentcuration.contentnode')),
+            ],
+        ),
+        migrations.CreateModel(
+            name='ChannelFullTextSearch',
+            fields=[
+                ('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)),
+                ('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)),
+                ('channel', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='channel_fts', to='contentcuration.channel')),
+            ],
+        ),
+        AddIndexConcurrently(
+            model_name='contentnodefulltextsearch',
+            index=django.contrib.postgres.indexes.GinIndex(fields=['keywords_tsvector'], name='node_keywords_tsv__gin_idx'),
+        ),
+        AddIndexConcurrently(
+            model_name='contentnodefulltextsearch',
+            index=django.contrib.postgres.indexes.GinIndex(fields=['author_tsvector'], name='node_author_tsv__gin_idx'),
+        ),
+        AddIndexConcurrently(
+            model_name='channelfulltextsearch',
+            index=django.contrib.postgres.indexes.GinIndex(fields=['keywords_tsvector'], name='channel_keywords_tsv__gin_idx'),
+        ),
+    ]
diff --git a/contentcuration/search/models.py b/contentcuration/search/models.py
@@ -1,8 +1,14 @@
 import uuid
 
 from django.conf import settings
+from django.contrib.postgres.indexes import GinIndex
+from django.contrib.postgres.search import SearchVectorField
 from django.db import models
 
+from contentcuration.models import Channel
+from contentcuration.models import ContentNode
+from contentcuration.models import UUIDField as StudioUUIDField
+
 
 class SavedSearch(models.Model):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
@@ -13,3 +19,37 @@ class SavedSearch(models.Model):
     saved_by = models.ForeignKey(
         settings.AUTH_USER_MODEL, related_name="searches", on_delete=models.CASCADE
     )
+
+
+class ContentNodeFullTextSearch(models.Model):
+    id = StudioUUIDField(primary_key=True, default=uuid.uuid4)
+
+    # The contentnode that this record points to (one row per node; deleted with it via CASCADE).
+    contentnode = models.OneToOneField(ContentNode, on_delete=models.CASCADE, related_name="node_fts")
+
+    # The channel to which the contentnode belongs. Channel cannot be NULL because we only allow
+    # searches to be made inside channels.
+    channel = models.ForeignKey(Channel, on_delete=models.CASCADE, related_name="channel_nodes_fts")
+
+    # Keywords tsvector; the set_contentnode_tsvectors command fills it from CONTENTNODE_KEYWORDS_TSVECTOR.
+    keywords_tsvector = SearchVectorField(null=True, blank=True)
+
+    # Author tsvector; the set_contentnode_tsvectors command fills it from CONTENTNODE_AUTHOR_TSVECTOR.
+    author_tsvector = SearchVectorField(null=True, blank=True)
+
+    class Meta:  # one GIN index per tsvector column
+        indexes = [GinIndex(fields=["keywords_tsvector"], name="node_keywords_tsv__gin_idx"),
+                   GinIndex(fields=["author_tsvector"], name="node_author_tsv__gin_idx")]
+
+
+class ChannelFullTextSearch(models.Model):
+    id = StudioUUIDField(primary_key=True, default=uuid.uuid4)
+
+    # The channel to which this record points (one row per channel; deleted with it via CASCADE).
+    channel = models.OneToOneField(Channel, on_delete=models.CASCADE, related_name="channel_fts")
+
+    # Channel keywords tsvector; the set_channel_tsvectors command fills it from CHANNEL_KEYWORDS_TSVECTOR.
+    keywords_tsvector = SearchVectorField(null=True, blank=True)
+
+    class Meta:  # GIN index on the tsvector column
+        indexes = [GinIndex(fields=["keywords_tsvector"], name="channel_keywords_tsv__gin_idx")]