Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ filedurations:
# Runs the set_default_learning_activities management command.
learningactivities:
python contentcuration/manage.py set_default_learning_activities

# Runs the set_channel_tsvectors and then the set_contentnode_tsvectors
# management commands, in that order.
set-tsvectors:
python contentcuration/manage.py set_channel_tsvectors
python contentcuration/manage.py set_contentnode_tsvectors

###############################################################
# END PRODUCTION COMMANDS #####################################
###############################################################
Expand Down
2 changes: 1 addition & 1 deletion contentcuration/contentcuration/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
'webpack_loader',
'django_filters',
'mathfilters',
'django.contrib.postgres',
'django_celery_results',
)

Expand Down Expand Up @@ -220,7 +221,6 @@

# Holds the environment variable's string value when it is set to a non-empty
# string, otherwise False.
# NOTE(review): any non-empty value -- including "0" or "false" -- is truthy
# here; confirm deployments only set the variable when partitioning is enabled.
IS_CONTENTNODE_TABLE_PARTITIONED = os.getenv("IS_CONTENTNODE_TABLE_PARTITIONED") or False


# Database routers for this project; a single router is listed here.
DATABASE_ROUTERS = [
"kolibri_content.router.ContentDBRouter",
]
Expand Down
4 changes: 2 additions & 2 deletions contentcuration/contentcuration/tests/utils/test_cache.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import mock
from django.test import SimpleTestCase
from django.test import TestCase

from ..helpers import mock_class_instance
from contentcuration.models import ContentNode
from contentcuration.utils.cache import ResourceSizeCache


class ResourceSizeCacheTestCase(SimpleTestCase):
class ResourceSizeCacheTestCase(TestCase):
Comment thread
bjester marked this conversation as resolved.
def setUp(self):
super(ResourceSizeCacheTestCase, self).setUp()
self.node = mock.Mock(spec_set=ContentNode())
Expand Down
4 changes: 2 additions & 2 deletions contentcuration/contentcuration/tests/utils/test_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from dateutil.parser import isoparse
from django.db.models import F
from django.db.models import Max
from django.test import SimpleTestCase
from django.test import TestCase

from ..base import StudioTestCase
from contentcuration.models import ContentNode
Expand Down Expand Up @@ -42,7 +42,7 @@ def test_modified_since(self):

@mock.patch("contentcuration.utils.nodes.ResourceSizeHelper")
@mock.patch("contentcuration.utils.nodes.ResourceSizeCache")
class CalculateResourceSizeTestCase(SimpleTestCase):
class CalculateResourceSizeTestCase(TestCase):
def setUp(self):
super(CalculateResourceSizeTestCase, self).setUp()
self.node = mock.Mock(spec_set=ContentNode())
Expand Down
16 changes: 16 additions & 0 deletions contentcuration/search/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from django.contrib.postgres.search import SearchVector

# Full text search configuration handed to every SearchVector defined below.
# "simple" is used so that search stays language agnostic.
POSTGRES_FTS_CONFIG = "simple"

# ContentNode keywords: the fields that feed the vector, then the vector itself.
CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS = (
    "id",
    "channel_id",
    "node_id",
    "content_id",
    "tree_id",
    "title",
    "description",
    "contentnode_tags",
)
CONTENTNODE_KEYWORDS_TSVECTOR = SearchVector(
    *CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS,
    config=POSTGRES_FTS_CONFIG
)

# ContentNode authorship: the fields that feed the vector, then the vector itself.
CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = (
    "author",
    "aggregator",
    "provider",
)
CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(
    *CONTENTNODE_AUTHOR_TSVECTOR_FIELDS,
    config=POSTGRES_FTS_CONFIG
)

# Channel keywords: the fields that feed the vector, then the vector itself.
CHANNEL_KEYWORDS_TSVECTOR_FIELDS = (
    "id",
    "main_tree__tree_id",
    "name",
    "description",
    "tagline",
    "primary_channel_token",
)
CHANNEL_KEYWORDS_TSVECTOR = SearchVector(
    *CHANNEL_KEYWORDS_TSVECTOR_FIELDS,
    config=POSTGRES_FTS_CONFIG
)
Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""
This command bulk-inserts channel tsvectors into the ChannelFullTextSearch table.
"""
import logging as logmodule
import time

from django.core.management.base import BaseCommand
from django.db.models import Exists
from django.db.models import OuterRef
from search.constants import CHANNEL_KEYWORDS_TSVECTOR
from search.models import ChannelFullTextSearch

from contentcuration.models import Channel
from contentcuration.viewsets.channel import primary_token_subquery


# Configure the root logger at INFO level.
logmodule.basicConfig(level=logmodule.INFO)
# NOTE: `logging` is a Logger instance named "command", not the stdlib module
# (the module itself is imported as `logmodule`).
logging = logmodule.getLogger("command")

# Maximum number of channels fetched and inserted per batch.
CHUNKSIZE = 5000


class Command(BaseCommand):
    """Create the missing ``ChannelFullTextSearch`` rows, one batch at a time."""

    def handle(self, *args, **options):
        """
        Insert a tsvector row for every eligible channel that lacks one.

        Eligible channels are those with ``deleted=False`` whose main tree is
        published. Rows are created in batches of at most ``CHUNKSIZE`` until
        the query returns nothing, and progress is logged along the way.
        """
        started_at = time.time()

        # True for channels that do not yet own a ChannelFullTextSearch row.
        # Freshly inserted channels therefore drop out of the query, which is
        # what lets the same slice return the next batch on each evaluation.
        missing_fts_row = ~Exists(
            ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id"))
        )

        pending_channels = Channel.objects.filter(
            missing_fts_row,
            deleted=False,
            main_tree__published=True,
        )
        pending_channels = pending_channels.annotate(
            primary_channel_token=primary_token_subquery,
            keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR,
        )
        pending_channels = pending_channels.values("id", "keywords_tsvector")

        inserted_total = 0

        while True:
            # Slicing builds a new queryset, so this hits the database each time.
            batch = list(pending_channels[:CHUNKSIZE])
            if not batch:
                break

            logging.info("Inserting channel tsvectors.")

            fts_rows = [
                ChannelFullTextSearch(channel_id=row["id"], keywords_tsvector=row["keywords_tsvector"])
                for row in batch
            ]
            created_rows = ChannelFullTextSearch.objects.bulk_create(fts_rows)

            batch_count = len(created_rows)
            inserted_total = inserted_total + batch_count

            logging.info("Inserted {} channel tsvectors.".format(batch_count))

        elapsed = time.time() - started_at
        logging.info("Completed! successfully inserted total of {} channel tsvectors in {} seconds.".format(
            inserted_total, elapsed))
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
This command bulk-inserts contentnode tsvectors into the ContentNodeFullTextSearch table.
"""
import logging as logmodule
import time

from django.contrib.postgres.aggregates import StringAgg
from django.core.management.base import BaseCommand
from django.db.models import Exists
from django.db.models import OuterRef
from search.constants import CONTENTNODE_AUTHOR_TSVECTOR
from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR
from search.models import ContentNodeFullTextSearch

from contentcuration.models import ContentNode


# Configure the root logger at INFO level.
logmodule.basicConfig(level=logmodule.INFO)
# NOTE: `logging` is a Logger instance named "command", not the stdlib module
# (the module itself is imported as `logmodule`).
logging = logmodule.getLogger("command")

# Maximum number of contentnodes fetched and inserted per batch.
CHUNKSIZE = 10000


class Command(BaseCommand):
    """Create the missing ``ContentNodeFullTextSearch`` rows, one batch at a time."""

    def handle(self, *args, **options):
        """
        Insert tsvector rows for every eligible contentnode that lacks one.

        Eligible nodes are published nodes whose annotated ``channel_id`` is
        not NULL. Rows are created in batches of at most ``CHUNKSIZE`` until
        the query returns nothing, and progress is logged along the way.
        """
        started_at = time.time()

        # True for nodes that do not yet own a ContentNodeFullTextSearch row.
        # Freshly inserted nodes therefore drop out of the query, which is what
        # lets the same slice return the next batch on each evaluation.
        missing_fts_row = ~Exists(
            ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))
        )

        nodes_with_channel = ContentNode._annotate_channel_id(ContentNode.objects)
        pending_nodes = nodes_with_channel.annotate(
            contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
            keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
            author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR,
        )
        pending_nodes = pending_nodes.filter(
            missing_fts_row,
            published=True,
            channel_id__isnull=False,
        )
        # The empty order_by() clears any ordering from the queryset.
        pending_nodes = pending_nodes.values(
            "id", "channel_id", "keywords_tsvector", "author_tsvector"
        ).order_by()

        inserted_total = 0

        while True:
            # Slicing builds a new queryset, so this hits the database each time.
            batch = list(pending_nodes[:CHUNKSIZE])
            if not batch:
                break

            logging.info("Inserting contentnode tsvectors.")

            fts_rows = [
                ContentNodeFullTextSearch(
                    contentnode_id=row["id"],
                    channel_id=row["channel_id"],
                    keywords_tsvector=row["keywords_tsvector"],
                    author_tsvector=row["author_tsvector"],
                )
                for row in batch
            ]
            created_rows = ContentNodeFullTextSearch.objects.bulk_create(fts_rows)

            batch_count = len(created_rows)
            inserted_total = inserted_total + batch_count

            logging.info("Inserted {} contentnode tsvectors.".format(batch_count))

        elapsed = time.time() - started_at
        logging.info("Completed! Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(inserted_total, elapsed))
54 changes: 54 additions & 0 deletions contentcuration/search/migrations/0003_fulltextsearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Generated by Django 3.2.14 on 2022-09-16 08:55
import uuid

import django.contrib.postgres.indexes
import django.contrib.postgres.search
import django.db.models.deletion
from django.contrib.postgres.operations import AddIndexConcurrently
from django.db import migrations
from django.db import models

import contentcuration.models


class Migration(migrations.Migration):
# Creates the ContentNodeFullTextSearch and ChannelFullTextSearch models and
# adds a GIN index on each of their tsvector columns.

# Run this migration outside a transaction.
# NOTE(review): presumably required by the AddIndexConcurrently operations
# below -- confirm against the Django documentation for that operation.
atomic = False

dependencies = [
('contentcuration', '0140_delete_task'),
('search', '0002_auto_20201215_2110'),
]

operations = [
# One row per contentnode (OneToOneField), also linked to a channel.
migrations.CreateModel(
name='ContentNodeFullTextSearch',
fields=[
('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)),
('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)),
('author_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)),
('channel', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='channel_nodes_fts', to='contentcuration.channel')),
('contentnode', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='node_fts', to='contentcuration.contentnode')),
],
),
# One row per channel (OneToOneField).
migrations.CreateModel(
name='ChannelFullTextSearch',
fields=[
('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)),
('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)),
('channel', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='channel_fts', to='contentcuration.channel')),
],
),
# GIN indexes on the tsvector columns, added concurrently.
AddIndexConcurrently(
model_name='contentnodefulltextsearch',
index=django.contrib.postgres.indexes.GinIndex(fields=['keywords_tsvector'], name='node_keywords_tsv__gin_idx'),
),
AddIndexConcurrently(
model_name='contentnodefulltextsearch',
index=django.contrib.postgres.indexes.GinIndex(fields=['author_tsvector'], name='node_author_tsv__gin_idx'),
),
AddIndexConcurrently(
model_name='channelfulltextsearch',
index=django.contrib.postgres.indexes.GinIndex(fields=['keywords_tsvector'], name='channel_keywords_tsv__gin_idx'),
),
]
40 changes: 40 additions & 0 deletions contentcuration/search/models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
import uuid

from django.conf import settings
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVectorField
from django.db import models

from contentcuration.models import Channel
from contentcuration.models import ContentNode
from contentcuration.models import UUIDField as StudioUUIDField


class SavedSearch(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
Expand All @@ -13,3 +19,37 @@ class SavedSearch(models.Model):
saved_by = models.ForeignKey(
settings.AUTH_USER_MODEL, related_name="searches", on_delete=models.CASCADE
)


class ContentNodeFullTextSearch(models.Model):
    """
    Full text search record for a single contentnode.

    Holds the keywords and author tsvectors for the node, together with the
    channel the node belongs to. Both tsvector columns carry a GIN index.
    """

    id = StudioUUIDField(primary_key=True, default=uuid.uuid4)

    # The contentnode this record describes; at most one record per node.
    contentnode = models.OneToOneField(
        ContentNode,
        on_delete=models.CASCADE,
        related_name="node_fts",
    )

    # The channel that owns the contentnode. This is never NULL because
    # searches are only allowed to be made inside channels.
    channel = models.ForeignKey(
        Channel,
        on_delete=models.CASCADE,
        related_name="channel_nodes_fts",
    )

    # Keywords of the contentnode, stored as a tsvector.
    keywords_tsvector = SearchVectorField(null=True, blank=True)

    # Author information of the contentnode, stored as a tsvector.
    author_tsvector = SearchVectorField(null=True, blank=True)

    class Meta:
        indexes = [
            GinIndex(fields=["keywords_tsvector"], name="node_keywords_tsv__gin_idx"),
            GinIndex(fields=["author_tsvector"], name="node_author_tsv__gin_idx"),
        ]


class ChannelFullTextSearch(models.Model):
    """
    Full text search record for a single channel.

    Holds the channel's keywords tsvector, which carries a GIN index.
    """

    id = StudioUUIDField(primary_key=True, default=uuid.uuid4)

    # The channel this record describes; at most one record per channel.
    channel = models.OneToOneField(
        Channel,
        on_delete=models.CASCADE,
        related_name="channel_fts",
    )

    # Keywords of the channel, stored as a tsvector for super fast searches.
    keywords_tsvector = SearchVectorField(null=True, blank=True)

    class Meta:
        indexes = [
            GinIndex(fields=["keywords_tsvector"], name="channel_keywords_tsv__gin_idx"),
        ]