-
-
Notifications
You must be signed in to change notification settings - Fork 301
Add new models for full text search and migration commands #3651
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| from django.contrib.postgres.search import SearchVector | ||
|
|
||
# Text search configuration handed to every SearchVector defined below.
# "simple" is used so that search stays language agnostic.
POSTGRES_FTS_CONFIG = "simple"


# ContentNode: field names folded into the keywords vector, and the vector itself.
CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS = (
    "id",
    "channel_id",
    "node_id",
    "content_id",
    "tree_id",
    "title",
    "description",
    "contentnode_tags",
)
CONTENTNODE_KEYWORDS_TSVECTOR = SearchVector(
    *CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS,
    config=POSTGRES_FTS_CONFIG
)

# ContentNode: field names folded into the author vector, and the vector itself.
CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = (
    "author",
    "aggregator",
    "provider",
)
CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(
    *CONTENTNODE_AUTHOR_TSVECTOR_FIELDS,
    config=POSTGRES_FTS_CONFIG
)

# Channel: field names folded into the keywords vector, and the vector itself.
CHANNEL_KEYWORDS_TSVECTOR_FIELDS = (
    "id",
    "main_tree__tree_id",
    "name",
    "description",
    "tagline",
    "primary_channel_token",
)
CHANNEL_KEYWORDS_TSVECTOR = SearchVector(
    *CHANNEL_KEYWORDS_TSVECTOR_FIELDS,
    config=POSTGRES_FTS_CONFIG
)
Empty file.
Empty file.
57 changes: 57 additions & 0 deletions
57
contentcuration/search/management/commands/set_channel_tsvectors.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| """ | ||
| This command bulk-inserts channel tsvectors into the ChannelFullTextSearch table. | ||
| """ | ||
| import logging as logmodule | ||
| import time | ||
|
|
||
| from django.core.management.base import BaseCommand | ||
| from django.db.models import Exists | ||
| from django.db.models import OuterRef | ||
| from search.constants import CHANNEL_KEYWORDS_TSVECTOR | ||
| from search.models import ChannelFullTextSearch | ||
|
|
||
| from contentcuration.models import Channel | ||
| from contentcuration.viewsets.channel import primary_token_subquery | ||
|
|
||
|
|
||
# Maximum number of channels fetched and inserted per batch.
CHUNKSIZE = 5000

# Configure root logging at INFO level and log through the logger named "command".
logmodule.basicConfig(level=logmodule.INFO)
logging = logmodule.getLogger("command")
|
|
||
|
|
||
class Command(BaseCommand):
    """Management command that fills ChannelFullTextSearch in batches of CHUNKSIZE."""

    def handle(self, *args, **options):
        """
        Create a ChannelFullTextSearch row for every matching channel lacking one.

        Channels are selected when they are not deleted, their main tree is
        published, and no ChannelFullTextSearch row references them yet. The
        query is re-run for each batch, so rows inserted by the previous batch
        drop out of the next one and the loop ends once a batch comes back empty.
        """
        started_at = time.time()

        # Condition that holds for channels without a ChannelFullTextSearch row.
        missing_fts_row = ~Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id")))

        pending_channels = (
            Channel.objects
            .filter(missing_fts_row, deleted=False, main_tree__published=True)
            # primary_channel_token is annotated first because the keywords
            # vector lists it among its fields.
            .annotate(primary_channel_token=primary_token_subquery,
                      keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR)
            .values("id", "keywords_tsvector")
        )

        inserted_total = 0

        while True:
            # Slicing yields a fresh queryset, so every pass hits the database again.
            batch = list(pending_channels[:CHUNKSIZE])
            if not batch:
                break

            logging.info("Inserting channel tsvectors.")

            created = ChannelFullTextSearch.objects.bulk_create([
                ChannelFullTextSearch(channel_id=row["id"], keywords_tsvector=row["keywords_tsvector"])
                for row in batch
            ])

            batch_count = len(created)
            inserted_total += batch_count

            logging.info("Inserted {} channel tsvectors.".format(batch_count))

        logging.info("Completed! successfully inserted total of {} channel tsvectors in {} seconds.".format(
            inserted_total, time.time() - started_at))
59 changes: 59 additions & 0 deletions
59
contentcuration/search/management/commands/set_contentnode_tsvectors.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| """ | ||
| This command bulk-inserts contentnode tsvectors into the ContentNodeFullTextSearch table. | ||
| """ | ||
| import logging as logmodule | ||
| import time | ||
|
|
||
| from django.contrib.postgres.aggregates import StringAgg | ||
| from django.core.management.base import BaseCommand | ||
| from django.db.models import Exists | ||
| from django.db.models import OuterRef | ||
| from search.constants import CONTENTNODE_AUTHOR_TSVECTOR | ||
| from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR | ||
| from search.models import ContentNodeFullTextSearch | ||
|
|
||
| from contentcuration.models import ContentNode | ||
|
|
||
|
|
||
# Maximum number of contentnodes fetched and inserted per batch.
CHUNKSIZE = 10000

# Configure root logging at INFO level and log through the logger named "command".
logmodule.basicConfig(level=logmodule.INFO)
logging = logmodule.getLogger("command")
|
|
||
|
|
||
class Command(BaseCommand):
    """Management command that fills ContentNodeFullTextSearch in batches of CHUNKSIZE."""

    def handle(self, *args, **options):
        """
        Create a ContentNodeFullTextSearch row for every matching node lacking one.

        Nodes are selected when they are published, have a non-null annotated
        channel_id, and no ContentNodeFullTextSearch row references them yet.
        The query is re-run for each batch, so rows inserted by the previous
        batch drop out of the next one and the loop ends once a batch comes
        back empty.
        """
        started_at = time.time()

        # Condition that holds for nodes without a ContentNodeFullTextSearch row.
        missing_fts_row = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id")))

        pending_nodes = (
            ContentNode._annotate_channel_id(ContentNode.objects)
            # contentnode_tags is annotated first because the keywords vector
            # lists it among its fields.
            .annotate(
                contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
                keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
                author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR,
            )
            .filter(missing_fts_row, published=True, channel_id__isnull=False)
            .values("id", "channel_id", "keywords_tsvector", "author_tsvector")
            # Empty order_by() clears any ordering on the queryset.
            .order_by()
        )

        inserted_total = 0

        while True:
            # Slicing yields a fresh queryset, so every pass hits the database again.
            batch = list(pending_nodes[:CHUNKSIZE])
            if not batch:
                break

            logging.info("Inserting contentnode tsvectors.")

            created = ContentNodeFullTextSearch.objects.bulk_create([
                ContentNodeFullTextSearch(
                    contentnode_id=row["id"],
                    channel_id=row["channel_id"],
                    keywords_tsvector=row["keywords_tsvector"],
                    author_tsvector=row["author_tsvector"],
                )
                for row in batch
            ])

            batch_count = len(created)
            inserted_total += batch_count

            logging.info("Inserted {} contentnode tsvectors.".format(batch_count))

        logging.info("Completed! Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(inserted_total, time.time() - started_at))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| # Generated by Django 3.2.14 on 2022-09-16 08:55 | ||
| import uuid | ||
|
|
||
| import django.contrib.postgres.indexes | ||
| import django.contrib.postgres.search | ||
| import django.db.models.deletion | ||
| from django.contrib.postgres.operations import AddIndexConcurrently | ||
| from django.db import migrations | ||
| from django.db import models | ||
|
|
||
| import contentcuration.models | ||
|
|
||
|
|
||
class Migration(migrations.Migration):
    """Create the two full text search tables and GIN indexes on their tsvector columns."""

    # The AddIndexConcurrently operations below cannot run inside a
    # transaction, so the migration is declared non-atomic.
    atomic = False

    dependencies = [
        ('contentcuration', '0140_delete_task'),
        ('search', '0002_auto_20201215_2110'),
    ]

    operations = [
        # One row per contentnode: keywords and author vectors, plus the owning channel.
        migrations.CreateModel(
            name='ContentNodeFullTextSearch',
            fields=[
                (
                    'id',
                    contentcuration.models.UUIDField(
                        default=uuid.uuid4, max_length=32, primary_key=True, serialize=False
                    ),
                ),
                (
                    'keywords_tsvector',
                    django.contrib.postgres.search.SearchVectorField(blank=True, null=True),
                ),
                (
                    'author_tsvector',
                    django.contrib.postgres.search.SearchVectorField(blank=True, null=True),
                ),
                (
                    'channel',
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name='channel_nodes_fts',
                        to='contentcuration.channel',
                    ),
                ),
                (
                    'contentnode',
                    models.OneToOneField(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name='node_fts',
                        to='contentcuration.contentnode',
                    ),
                ),
            ],
        ),
        # One row per channel: keywords vector only.
        migrations.CreateModel(
            name='ChannelFullTextSearch',
            fields=[
                (
                    'id',
                    contentcuration.models.UUIDField(
                        default=uuid.uuid4, max_length=32, primary_key=True, serialize=False
                    ),
                ),
                (
                    'keywords_tsvector',
                    django.contrib.postgres.search.SearchVectorField(blank=True, null=True),
                ),
                (
                    'channel',
                    models.OneToOneField(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name='channel_fts',
                        to='contentcuration.channel',
                    ),
                ),
            ],
        ),
        # GIN indexes on each tsvector column, built concurrently.
        AddIndexConcurrently(
            model_name='contentnodefulltextsearch',
            index=django.contrib.postgres.indexes.GinIndex(
                fields=['keywords_tsvector'], name='node_keywords_tsv__gin_idx'
            ),
        ),
        AddIndexConcurrently(
            model_name='contentnodefulltextsearch',
            index=django.contrib.postgres.indexes.GinIndex(
                fields=['author_tsvector'], name='node_author_tsv__gin_idx'
            ),
        ),
        AddIndexConcurrently(
            model_name='channelfulltextsearch',
            index=django.contrib.postgres.indexes.GinIndex(
                fields=['keywords_tsvector'], name='channel_keywords_tsv__gin_idx'
            ),
        ),
    ]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.