[ckan-changes] commit/ckan: 2 new changesets
Bitbucket
commits-noreply at bitbucket.org
Wed Aug 17 14:08:53 UTC 2011
2 new changesets in ckan:
http://bitbucket.org/okfn/ckan/changeset/55f7546954bd/
changeset: 55f7546954bd
branch: feature-1275-solr-search
user: John Glover
date: 2011-08-17 15:59:07
summary: [test] ckan.site_id should ideally be set when using solr search, so add to test config
affected #: 1 file (25 bytes)
--- a/test-core.ini Mon Aug 15 18:06:04 2011 +0100
+++ b/test-core.ini Wed Aug 17 14:59:07 2011 +0100
@@ -19,6 +19,7 @@
ckan.tests.functional.test_cache.expires = 1800
ckan.tests.functional.test_cache.TestCacheBasics.test_get_cache_expires.expires = 3600
+ckan.site_id = ckan_test
ckan.site_title = CKAN
ckan.site_logo = /images/ckan_logo_fullname_long.png
package_form = standard
http://bitbucket.org/okfn/ckan/changeset/53c6c738771d/
changeset: 53c6c738771d
branch: feature-1275-solr-search
user: John Glover
date: 2011-08-17 16:07:22
summary: Add solr search to core (postgres is still the default)
affected #: 10 files (43.2 KB)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ckan/config/schema.xml Wed Aug 17 15:07:22 2011 +0100
@@ -0,0 +1,162 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<schema name="ckan" version="1.2">
+
+<types>
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+ <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
+ <fieldtype name="binary" class="solr.BinaryField"/>
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+ <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+ </analyzer>
+ </fieldType>
+
+
+ <!-- A general unstemmed text field - good if one does not know the language of the field -->
+ <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+</types>
+
+
+<fields>
+ <field name="id" type="string" indexed="true" stored="true" required="true" />
+ <field name="site_id" type="string" indexed="true" stored="true" required="true" />
+ <field name="title" type="text" indexed="true" stored="true" />
+ <field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="revision_id" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="version" type="string" indexed="true" stored="true" />
+ <field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="notes" type="text" indexed="true" stored="true"/>
+ <field name="author" type="textgen" indexed="true" stored="true" />
+ <field name="author_email" type="textgen" indexed="true" stored="true" />
+ <field name="maintainer" type="textgen" indexed="true" stored="true" />
+ <field name="maintainer_email" type="textgen" indexed="true" stored="true" />
+ <field name="license" type="string" indexed="true" stored="true" />
+ <field name="license_id" type="string" indexed="true" stored="true" />
+ <field name="ratings_count" type="int" indexed="true" stored="false" />
+ <field name="ratings_average" type="float" indexed="true" stored="false" />
+ <field name="tags" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="groups" type="string" indexed="true" stored="true" multiValued="true"/>
+
+ <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
+ <field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>
+
+ <!-- catchall field, containing all other searchable text fields (implemented
+ via copyField further on in this schema -->
+ <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
+
+ <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="extras_*" type="text" indexed="true" stored="false" multiValued="true"/>
+
+ <field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
+
+ <dynamicField name="*" type="string" indexed="true" stored="false"/>
+</fields>
+
+<uniqueKey>id</uniqueKey>
+<defaultSearchField>text</defaultSearchField>
+<solrQueryParser defaultOperator="AND"/>
+
+<copyField source="url" dest="urls"/>
+<copyField source="ckan_url" dest="urls"/>
+<copyField source="download_url" dest="urls"/>
+<copyField source="res_url" dest="urls"/>
+<copyField source="extras_*" dest="text"/>
+<copyField source="urls" dest="text"/>
+<copyField source="name" dest="text"/>
+<copyField source="title" dest="text"/>
+<copyField source="text" dest="text"/>
+<copyField source="license" dest="text"/>
+<copyField source="notes" dest="text"/>
+<copyField source="tags" dest="text"/>
+<copyField source="groups" dest="text"/>
+<copyField source="res_description" dest="text"/>
+<copyField source="maintainer" dest="text"/>
+<copyField source="author" dest="text"/>
+
+</schema>
--- a/ckan/lib/cli.py Wed Aug 17 14:59:07 2011 +0100
+++ b/ckan/lib/cli.py Wed Aug 17 15:07:22 2011 +0100
@@ -235,6 +235,7 @@
search-index rebuild - indexes all packages (default)
search-index check - checks for packages not indexed
search-index show {package-name} - shows index of a package
+ search-index clear - clears the search index for this ckan instance
'''
summary = __doc__.split('\n')[0]
@@ -244,7 +245,7 @@
def command(self):
self._load_config()
- from ckan.lib.search import rebuild, check, show
+ from ckan.lib.search import rebuild, check, show, clear
if not self.args:
# default to run
@@ -261,6 +262,8 @@
import pdb; pdb.set_trace()
self.args
show(self.args[1])
+ elif cmd == 'clear':
+ clear()
else:
print 'Command %s not recognized' % cmd
--- a/ckan/lib/search/__init__.py Wed Aug 17 14:59:07 2011 +0100
+++ b/ckan/lib/search/__init__.py Wed Aug 17 15:07:22 2011 +0100
@@ -2,6 +2,7 @@
import pkg_resources
from pylons import config
from common import QueryOptions, SearchError, SearchQuery, SearchBackend, SearchIndex
+from solr_backend import SolrSearchBackend
from worker import dispatch_by_operation
log = logging.getLogger(__name__)
@@ -69,6 +70,13 @@
package_index = backend.index_for(model.Package)
print package_index.get_index(package_reference)
+def clear():
+ from ckan import model
+ backend = get_backend()
+ log.debug("Clearing search index...")
+ package_index = backend.index_for(model.Package)
+ package_index.clear()
+
def query_for(_type, backend=None):
""" Query for entities of a specified type (name, class, instance). """
return get_backend(backend=backend).query_for(_type)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ckan/lib/search/solr_backend.py Wed Aug 17 15:07:22 2011 +0100
@@ -0,0 +1,90 @@
+from pylons import config
+from ckan.lib.search import SearchBackend, SearchQuery, SearchIndex, \
+ SearchError
+from ckan.authz import Authorizer
+from ckan import model
+from solr_indexing import make_connection, index_package, delete_package, \
+ clear_index
+import logging
+log = logging.getLogger(__name__)
+
+
+class SolrSearchBackend(SearchBackend):
+
+ def _setup(self):
+ self.register(model.Package.__name__, PackageSolrSearchIndex, PackageSolrSearchQuery)
+
+class PackageSolrSearchQuery(SearchQuery):
+
+ def _run(self):
+ fq = ""
+
+ # Filter for options
+ if self.options.filter_by_downloadable:
+ fq += u" +res_url:[* TO *] " # not null resource URL
+ if self.options.filter_by_openness:
+ licenses = ["license_id:%s" % id for id in self.open_licenses]
+ licenses = " OR ".join(licenses)
+ fq += " +(%s) " % licenses
+
+ order_by = self.options.order_by
+ if order_by == 'rank' or order_by is None:
+ order_by = 'score'
+
+ # show only results from this CKAN instance:
+ fq = fq + " +site_id:\"%s\" " % config.get('ckan.site_id')
+
+ # Filter for package status
+ fq += "+state:active "
+
+ # configurable for iati: full options list
+ facet_limit = int(config.get('search.facets.limit', '50'))
+
+ # query
+ query = self.query.query
+ if (not query) or (not query.strip()):
+ # no query terms, i.e. all documents
+ query = '*:*'
+
+ conn = make_connection(config)
+ try:
+ data = conn.query(query,
+ fq=fq,
+ # make sure data.facet_counts is set:
+ facet='true',
+ facet_limit=facet_limit,
+ facet_field=self.facet_by,
+ facet_mincount=1,
+ start=self.options.offset,
+ rows=self.options.limit,
+ fields='id,score',
+ sort_order='desc',
+ sort=order_by)
+
+ except Exception, e:
+ # this wrapping will be caught further up in the WUI.
+ log.exception(e)
+ raise SearchError(e)
+ finally:
+ conn.close()
+
+ self.count = int(data.numFound)
+ scores = dict([(r.get('id'), r.get('score')) for r in data.results])
+ q = Authorizer().authorized_query(self.options.username, model.Package)
+ q = q.filter(model.Package.id.in_(scores.keys()))
+ self.facets = data.facet_counts.get('facet_fields', {})
+ self.results = sorted(q, key=lambda r: scores[r.id], reverse=True)
+
+
+class SolrSearchIndex(SearchIndex):
+
+ def clear(self):
+ clear_index(config)
+
+class PackageSolrSearchIndex(SolrSearchIndex):
+
+ def remove_dict(self, pkg_dict):
+ delete_package(pkg_dict, config)
+
+ def update_dict(self, pkg_dict):
+ index_package(pkg_dict, config)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ckan/lib/search/solr_indexing.py Wed Aug 17 15:07:22 2011 +0100
@@ -0,0 +1,113 @@
+import itertools
+import string
+from solr import SolrConnection # == solrpy
+import logging
+log = logging.getLogger(__name__)
+
+TYPE_FIELD = "entity_type"
+PACKAGE_TYPE = "package"
+KEY_CHARS = string.digits + string.letters + "_-"
+
+SOLR_FIELDS = [TYPE_FIELD, "res_url", "text", "urls", "indexed_ts", "site_id"]
+
+RESERVED_FIELDS = SOLR_FIELDS + ["tags", "groups", "res_description",
+ "res_format", "res_url"]
+
+# HACK: this is copied over from model.PackageRelationship
+RELATIONSHIP_TYPES = [(u'depends_on', u'dependency_of'),
+ (u'derives_from', u'has_derivation'),
+ (u'links_to', u'linked_from'),
+ (u'child_of', u'parent_of'),
+ ]
+
+def make_connection(config):
+ url = config.get('solr_url', 'http://localhost:8983/solr')
+ user = config.get('solr_user')
+ password = config.get('solr_password')
+
+ if user is not None and password is not None:
+ return SolrConnection(url, http_user=user, http_pass=password)
+ else:
+ return SolrConnection(url)
+
+
+def index_package(pkg_dict, config):
+ if pkg_dict is None:
+ return
+ if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
+ return delete_package(pkg_dict, config)
+ conn = make_connection(config)
+ index_fields = RESERVED_FIELDS + pkg_dict.keys()
+
+ # include the extras in the main namespace
+ extras = pkg_dict.get('extras', {})
+ for (key, value) in extras.items():
+ if isinstance(value, (tuple, list)):
+ value = " ".join(map(unicode, value))
+ key = ''.join([c for c in key if c in KEY_CHARS])
+ pkg_dict['extras_' + key] = value
+ if key not in index_fields:
+ pkg_dict[key] = value
+ if 'extras' in pkg_dict:
+ del pkg_dict['extras']
+
+ # flatten the structure for indexing:
+ for resource in pkg_dict.get('resources', []):
+ for (okey, nkey) in [('description', 'res_description'),
+ ('format', 'res_format'),
+ ('url', 'res_url')]:
+ pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
+ if 'resources' in pkg_dict:
+ del pkg_dict['resources']
+
+ # index relationships as <type>:<object>
+ rel_dict = {}
+ rel_types = list(itertools.chain(RELATIONSHIP_TYPES))
+ for rel in pkg_dict.get('relationships', []):
+ _type = rel.get('type', 'rel')
+ if (_type in pkg_dict.keys()) or (_type not in rel_types):
+ continue
+ rel_dict[_type] = rel_dict.get(_type, []) + [rel.get('object')]
+
+ pkg_dict.update(rel_dict)
+
+ if 'relationships' in pkg_dict:
+ del pkg_dict['relationships']
+
+ pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
+ pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])
+
+ # mark this CKAN instance as data source:
+ pkg_dict['site_id'] = config.get('ckan.site_id')
+
+ # send to solr:
+ try:
+ conn.add_many([pkg_dict])
+ conn.commit(wait_flush=False, wait_searcher=False)
+ finally:
+ conn.close()
+
+ log.debug("Updated index for %s" % pkg_dict.get('name'))
+
+
+def delete_package(pkg_dict, config):
+ conn = make_connection(config)
+ query = "+%s:%s +id:\"%s\" +site_id:\"%s\"" % (TYPE_FIELD, PACKAGE_TYPE,
+ pkg_dict.get('id'),
+ config.get('ckan.site_id'))
+ try:
+ conn.delete_query(query)
+ conn.commit()
+ finally:
+ conn.close()
+
+
+def clear_index(config):
+ conn = make_connection(config)
+ query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
+ try:
+ conn.delete_query(query)
+ conn.commit()
+ finally:
+ conn.close()
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ckan/lib/search/solr_worker.py Wed Aug 17 15:07:22 2011 +0100
@@ -0,0 +1,16 @@
+__import__("pkg_resources").get_distribution("ckanext-queue>=0.1")
+
+from ckanext.queue.worker import Worker
+from indexing import index_package, delete_package
+
+class SolrIndexingWorker(Worker):
+
+ def consume(self, routing_key, operation, payload):
+ assert 'solr_url' in self.config
+ assert 'ckan.site_id' in self.config
+
+ if routing_key == 'Package':
+ if operation in ['new', 'changed']:
+ index_package(payload, self.config)
+ elif operation == 'deleted':
+ delete_package(payload, self.config)
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ckan/tests/lib/test_solr_package_search.py Wed Aug 17 15:07:22 2011 +0100
@@ -0,0 +1,411 @@
+from pylons import config
+from ckan.tests import TestController, CreateTestData
+from ckan import model
+import ckan.lib.search as search
+
+
+class TestSearch(TestController):
+ # 'penguin' is in all test search packages
+ q_all = u'penguin'
+
+ @classmethod
+ def setup_class(cls):
+ model.Session.remove()
+ CreateTestData.create_search_test_data()
+ # now remove a tag so we can test search with deleted tags
+ model.repo.new_revision()
+ gils = model.Package.by_name(u'gils')
+ # an existing tag used only by gils
+ cls.tagname = u'registry'
+ idx = [t.name for t in gils.tags].index(cls.tagname)
+ del gils.tags[idx]
+ model.repo.commit_and_remove()
+ # solr
+ config['search_backend'] = 'solr'
+ search.rebuild()
+ cls.backend = search.get_backend()
+
+ @classmethod
+ def teardown_class(cls):
+ model.repo.rebuild_db()
+ search.get_backend().index_for('Package').clear()
+
+ def _pkg_names(self, result):
+ return ' '.join(result['results'])
+
+ def _check_entity_names(self, result, names_in_result):
+ names = result['results']
+ for name in names_in_result:
+ if name not in names:
+ return False
+ return True
+
+ def test_1_all_records(self):
+ result = self.backend.query_for(model.Package).run(query=self.q_all)
+ assert 'gils' in result['results'], result['results']
+ assert result['count'] == 6, result['count']
+
+ def test_1_name(self):
+ # exact name
+ result = self.backend.query_for(model.Package).run(query=u'gils')
+ assert result['count'] == 1, result
+ assert self._pkg_names(result) == 'gils', result
+
+ def test_1_name_multiple_results(self):
+ result = self.backend.query_for(model.Package).run(query=u'gov')
+ assert self._check_entity_names(result, ('us-gov-images', 'usa-courts-gov')), self._pkg_names(result)
+ assert result['count'] == 4, self._pkg_names(result)
+
+ def test_1_name_token(self):
+ result = self.backend.query_for(model.Package).run(query=u'name:gils')
+ assert self._pkg_names(result) == 'gils', self._pkg_names(result)
+ result = self.backend.query_for(model.Package).run(query=u'title:gils')
+ assert not self._check_entity_names(result, ('gils')), self._pkg_names(result)
+
+ def test_2_title(self):
+ # exact title, one word
+ result = self.backend.query_for(model.Package).run(query=u'Opengov.se')
+ assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result)
+ # multiple words
+ result = self.backend.query_for(model.Package).run(query=u'Government Expenditure')
+ assert self._pkg_names(result) == 'uk-government-expenditure', self._pkg_names(result)
+ # multiple words wrong order
+ result = self.backend.query_for(model.Package).run(query=u'Expenditure Government')
+ assert self._pkg_names(result) == 'uk-government-expenditure', self._pkg_names(result)
+ # multiple words, one doesn't match
+ result = self.backend.query_for(model.Package).run(query=u'Expenditure Government China')
+ assert len(result['results']) == 0, self._pkg_names(result)
+
+ def test_3_licence(self):
+ # this should result, but it is here to check that at least it does not error
+ result = self.backend.query_for(model.Package).run(query=u'license:"OKD::Other (PublicsDomain)"')
+ assert result['count'] == 0, result
+
+ def test_quotation(self):
+ # multiple words quoted
+ result = self.backend.query_for(model.Package).run(query=u'"Government Expenditure"')
+ assert self._pkg_names(result) == 'uk-government-expenditure', self._pkg_names(result)
+ # multiple words quoted wrong order
+ result = self.backend.query_for(model.Package).run(query=u'"Expenditure Government"')
+ assert self._pkg_names(result) == '', self._pkg_names(result)
+
+ def test_string_not_found(self):
+ result = self.backend.query_for(model.Package).run(query=u'randomthing')
+ assert self._pkg_names(result) == '', self._pkg_names(result)
+
+ def test_tags_field(self):
+ result = self.backend.query_for(model.Package).run(query=u'country-sweden')
+ assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
+
+ def test_tags_token_simple(self):
+ result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden')
+ assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
+ result = self.backend.query_for(model.Package).run(query=u'tags:wildlife')
+ assert self._pkg_names(result) == 'us-gov-images', self._pkg_names(result)
+
+ def test_tags_token_simple_with_deleted_tag(self):
+ # registry has been deleted
+ result = self.backend.query_for(model.Package).run(query=u'tags:registry')
+ assert self._pkg_names(result) == '', self._pkg_names(result)
+
+ def test_tags_token_multiple(self):
+ result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden tags:format-pdf')
+ assert self._pkg_names(result) == 'se-publications', self._pkg_names(result)
+
+ def test_tags_token_complicated(self):
+ result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden tags:somethingrandom')
+ assert self._pkg_names(result) == '', self._pkg_names(result)
+
+ def test_pagination(self):
+ # large search
+ all_results = self.backend.query_for(model.Package).run(query=self.q_all)
+ all_pkgs = all_results['results']
+ all_pkg_count = all_results['count']
+
+ # limit
+ options = search.QueryOptions()
+ options.limit = 2
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ count = result['count']
+ assert len(pkgs) == 2, pkgs
+ assert count == all_pkg_count
+ assert pkgs == all_pkgs[:2]
+
+ # offset
+ options = search.QueryOptions()
+ options.limit = 2
+ options.offset = 2
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ assert len(pkgs) == 2, pkgs
+ assert pkgs == all_pkgs[2:4]
+
+ # larger offset
+ options = search.QueryOptions()
+ options.limit = 2
+ options.offset = 4
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ assert len(pkgs) == 2, pkgs
+ assert pkgs == all_pkgs[4:6]
+
+ def test_order_by(self):
+ # large search
+ all_results = self.backend.query_for(model.Package).run(query=self.q_all)
+ all_pkgs = all_results['results']
+ all_pkg_count = all_results['count']
+
+ # rank
+ # TODO: fix this test
+ # options = search.QueryOptions()
+ # options.order_by = 'rank'
+ # result = self.backend.query_for(model.Package).run(query='penguin', options=options)
+ # pkgs = result['results']
+ # fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
+ # assert fields[0] == 'usa-courts-gov', fields # has penguin three times
+ # assert pkgs == all_pkgs, pkgs #default ordering
+
+ # name
+ options = search.QueryOptions()
+ options.order_by = 'name'
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
+ sorted_fields = fields; sorted_fields.sort()
+ assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+ # title
+ options = search.QueryOptions()
+ options.order_by = 'title'
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name).title for pkg_name in pkgs]
+ sorted_fields = fields; sorted_fields.sort()
+ assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+ # notes
+ options = search.QueryOptions()
+ options.order_by = 'notes'
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name).notes for pkg_name in pkgs]
+ sorted_fields = fields; sorted_fields.sort()
+ assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+ # extra field
+ options = search.QueryOptions()
+ options.order_by = 'date_released'
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name) for pkg_name in pkgs]
+ fields = [field.extras.get('date_released') for field in fields]
+ sorted_fields = fields; sorted_fields.sort()
+ assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+ def test_search_notes_on(self):
+ result = self.backend.query_for(model.Package).run(query=u'restrictions')
+ pkgs = result['results']
+ count = result['count']
+ assert len(pkgs) == 2, pkgs
+
+ def test_search_foreign_chars(self):
+ result = self.backend.query_for(model.Package).run(query='umlaut')
+ assert result['results'] == ['gils'], result['results']
+ result = self.backend.query_for(model.Package).run(query=u'thumb')
+ assert result['count'] == 0, result['results']
+ result = self.backend.query_for(model.Package).run(query=u'th\xfcmb')
+ assert result['results'] == ['gils'], result['results']
+
+ def test_groups(self):
+ result = self.backend.query_for(model.Package).run(query=u'groups:random')
+ assert self._pkg_names(result) == '', self._pkg_names(result)
+ result = self.backend.query_for(model.Package).run(query=u'groups:ukgov')
+ assert result['count'] == 4, self._pkg_names(result)
+ result = self.backend.query_for(model.Package).run(query=u'groups:ukgov tags:us')
+ assert result['count'] == 2, self._pkg_names(result)
+
+class TestSearchOverall(TestController):
+ @classmethod
+ def setup_class(cls):
+ CreateTestData.create()
+ config['search_backend'] = 'solr'
+ search.rebuild()
+ cls.backend = search.get_backend()
+
+ @classmethod
+ def teardown_class(cls):
+ model.repo.rebuild_db()
+ search.get_backend().index_for('Package').clear()
+
+ def _check_search_results(self, terms, expected_count, expected_packages=[], only_open=False, only_downloadable=False):
+ options = search.QueryOptions()
+ options.filter_by_openness = only_open
+ options.filter_by_downloadable = only_downloadable
+ result = self.backend.query_for(model.Package).run(query=unicode(terms))
+ pkgs = result['results']
+ count = result['count']
+ assert count == expected_count, (count, expected_count)
+ for expected_pkg in expected_packages:
+ assert expected_pkg in pkgs, '%s : %s' % (expected_pkg, result)
+
+ def test_overall(self):
+ print 'test_overall'
+ self._check_search_results('annakarenina', 1, ['annakarenina'])
+ self._check_search_results('warandpeace', 1, ['warandpeace'])
+ self._check_search_results('', 2)
+ self._check_search_results('A Novel By Tolstoy', 1, ['annakarenina'])
+ self._check_search_results('title:Novel', 1, ['annakarenina'])
+ self._check_search_results('title:peace', 0)
+ self._check_search_results('name:warandpeace', 1)
+ self._check_search_results('groups:david', 2)
+ self._check_search_results('groups:roger', 1)
+ self._check_search_results('groups:lenny', 0)
+ self._check_search_results('annakarenina', 1, ['annakarenina'], True, False)
+ self._check_search_results('annakarenina', 1, ['annakarenina'], False, True)
+ self._check_search_results('annakarenina', 1, ['annakarenina'], True, True)
+
+
+class TestGeographicCoverage(TestController):
+ @classmethod
+ def setup_class(cls):
+ init_data = [
+ {'name':'eng',
+ 'extras':{'geographic_coverage':'100000: England'},},
+ {'name':'eng_ni',
+ 'extras':{'geographic_coverage':'100100: England, Northern Ireland'},},
+ {'name':'uk',
+ 'extras':{'geographic_coverage':'111100: United Kingdom (England, Scotland, Wales, Northern Ireland'},},
+ {'name':'gb',
+ 'extras':{'geographic_coverage':'111000: Great Britain (England, Scotland, Wales)'},},
+ {'name':'none',
+ 'extras':{'geographic_coverage':'000000:'},},
+ ]
+ CreateTestData.create_arbitrary(init_data)
+ config['search_backend'] = 'solr'
+ search.rebuild()
+ cls.backend = search.get_backend()
+
+ @classmethod
+ def teardown_class(self):
+ model.repo.rebuild_db()
+ search.get_backend().index_for('Package').clear()
+
+ def _do_search(self, q, expected_pkgs, count=None):
+ options = search.QueryOptions()
+ options.order_by = 'rank'
+ result = self.backend.query_for(model.Package).run(query=q, options=options)
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
+ if not (count is None):
+ assert result['count'] == count, result['count']
+ for expected_pkg in expected_pkgs:
+ assert expected_pkg in fields, expected_pkg
+
+ def _filtered_search(self, value, expected_pkgs, count=None):
+ options = search.QueryOptions()
+ options.order_by = 'rank'
+ result = self.backend.query_for(model.Package).run(fields={'geographic_coverage':value}, options=options)
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
+ if not (count is None):
+ assert result['count'] == count, result['count']
+ for expected_pkg in expected_pkgs:
+ assert expected_pkg in fields, expected_pkg
+
+ def test_0_basic(self):
+ self._do_search(u'england', ['eng', 'eng_ni', 'uk', 'gb'], 4)
+ self._do_search(u'northern ireland', ['eng_ni', 'uk'], 2)
+ self._do_search(u'united kingdom', ['uk'], 1)
+ self._do_search(u'great britain', ['gb'], 1)
+
+ # TODO: solr is not currently set up to allow partial matches
+ # and extras are not saved as multivalued so this
+ # test will fail. Make multivalued or remove?
+ # def test_1_filtered(self):
+ # self._filtered_search(u'england', ['eng', 'eng_ni', 'uk', 'gb'], 4)
+
+class TestExtraFields(TestController):
+ @classmethod
+ def setup_class(cls):
+ init_data = [
+ {'name':'a',
+ 'extras':{'department':'abc',
+ 'agency':'ag-a'},},
+ {'name':'b',
+ 'extras':{'department':'bcd',
+ 'agency':'ag-b'},},
+ {'name':'c',
+ 'extras':{'department':'cde abc'},},
+ {'name':'none',
+ 'extras':{'department':''},},
+ ]
+ CreateTestData.create_arbitrary(init_data)
+ config['search_backend'] = 'solr'
+ search.rebuild()
+ cls.backend = search.get_backend()
+
+ @classmethod
+ def teardown_class(self):
+ model.repo.rebuild_db()
+ search.get_backend().index_for('Package').clear()
+
+ def _do_search(self, department, expected_pkgs, count=None):
+ result = self.backend.query_for(model.Package).run(fields={'department': department})
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
+ if not (count is None):
+ assert result['count'] == count, result['count']
+ for expected_pkg in expected_pkgs:
+ assert expected_pkg in fields, expected_pkg
+
+ def test_0_basic(self):
+ self._do_search(u'bcd', 'b', 1)
+ self._do_search(u'cde abc', 'c', 1)
+ # TODO: solr is not currently set up to allow partial matches
+ # and extras are not saved as multivalued so these
+ # tests will fail. Make multivalued or remove these?
+ # self._do_search(u'abc', ['a', 'c'], 2)
+ # self._do_search(u'cde', 'c', 1)
+ # self._do_search(u'abc cde', 'c', 1)
+
+class TestRank(TestController):
+ @classmethod
+ def setup_class(cls):
+ init_data = [{'name':u'test1-penguin-canary',
+ 'tags':u'canary goose squirrel wombat wombat'},
+ {'name':u'test2-squirrel-squirrel-canary-goose',
+ 'tags':u'penguin wombat'},
+ ]
+ CreateTestData.create_arbitrary(init_data)
+ cls.pkg_names = [
+ u'test1-penguin-canary',
+ u'test2-squirrel-squirrel-canary-goose'
+ ]
+ config['search_backend'] = 'solr'
+ search.rebuild()
+ cls.backend = search.get_backend()
+
+ @classmethod
+ def teardown_class(self):
+ model.repo.rebuild_db()
+ search.get_backend().index_for('Package').clear()
+
+ def _do_search(self, q, wanted_results):
+ options = search.QueryOptions()
+ options.order_by = 'rank'
+ result = self.backend.query_for(model.Package).run(query=q, options=options)
+ results = result['results']
+ err = 'Wanted %r, got %r' % (wanted_results, results)
+ assert wanted_results[0] == results[0], err
+ assert wanted_results[1] == results[1], err
+
+ def test_0_basic(self):
+ self._do_search(u'wombat', self.pkg_names)
+ self._do_search(u'squirrel', self.pkg_names[::-1])
+ self._do_search(u'canary', self.pkg_names)
+
+ # TODO: fix this test
+ # def test_1_weighting(self):
+ # self._do_search(u'penguin', self.pkg_names)
+ # self._do_search(u'goose', self.pkg_names[::-1])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ckan/tests/lib/test_solr_package_search_synchronous_update.py Wed Aug 17 15:07:22 2011 +0100
@@ -0,0 +1,124 @@
+from pylons import config
+from ckan import plugins, model
+import ckan.lib.search as search
+from ckan.tests import CreateTestData
+from test_solr_package_search import TestSearchOverall
+
+class TestSearchOverallWithSynchronousIndexing(TestSearchOverall):
+ '''Repeat test from test_package_search with synchronous indexing
+ '''
+
+ @classmethod
+ def setup_class(cls):
+ # Force a garbage collection to trigger issue #695
+ import gc
+ gc.collect()
+
+ CreateTestData.create()
+ config['search_backend'] = 'solr'
+ search.rebuild()
+ plugins.load('synchronous_search')
+ cls.backend = search.get_backend()
+
+ cls.new_pkg_dict = {
+ "name": "council-owned-litter-bins",
+ "notes": "Location of Council owned litter bins within Borough.",
+ "resources": [{"description": "Resource locator",
+ "format": "Unverified",
+ "url": "http://www.barrowbc.gov.uk"}],
+ "tags": ["Utility and governmental services"],
+ "title": "Council Owned Litter Bins",
+ "extras": {
+ "INSPIRE": "True",
+ "bbox-east-long": "-3.12442",
+ "bbox-north-lat": "54.218407",
+ "bbox-south-lat": "54.039634",
+ "bbox-west-long": "-3.32485",
+ "constraint": "conditions unknown; (e) intellectual property rights;",
+ "dataset-reference-date": [{"type": "creation",
+ "value": "2008-10-10"},
+ {"type": "revision",
+ "value": "2009-10-08"}],
+ "guid": "00a743bf-cca4-4c19-a8e5-e64f7edbcadd",
+ "metadata-date": "2009-10-16",
+ "metadata-language": "eng",
+ "published_by": 0,
+ "resource-type": "dataset",
+ "spatial-reference-system": "wee",
+ "temporal_coverage-from": "1977-03-10T11:45:30",
+ "temporal_coverage-to": "2005-01-15T09:10:00"
+ }
+ }
+
+ @classmethod
+ def teardown_class(cls):
+ model.repo.rebuild_db()
+ search.get_backend().index_for('Package').clear()
+
+ def _create_package(self, package=None):
+ rev = model.repo.new_revision()
+ rev.author = u'tester'
+ rev.message = u'Creating test data'
+ if not package:
+ package = model.Package()
+
+ relationship_attr = ['extras', 'resources', 'tags']
+ package_properties = {}
+ for key, value in self.new_pkg_dict.iteritems():
+ if key not in relationship_attr:
+ setattr(package, key, value)
+
+ tags = self.new_pkg_dict.get('tags', [])
+ for tag in tags:
+ package.add_tag_by_name(tag, autoflush=False)
+
+ for resource_dict in self.new_pkg_dict.get("resources", []):
+ resource = model.Resource(**resource_dict)
+ package.resources[:] = []
+ package.resources.append(resource)
+
+ for key, value in self.new_pkg_dict.get("extras", {}).iteritems():
+ extra = model.PackageExtra(key=key, value=value)
+ package._extras[key] = extra
+
+ model.Session.add(package)
+ model.setup_default_user_roles(package, [])
+ model.repo.commit_and_remove()
+ return package
+
+ def _remove_package(self):
+ package = model.Package.by_name('council-owned-litter-bins')
+ model.Session.delete(package)
+ model.Session.commit()
+
+ def test_01_search_table_count(self):
+ self._check_search_results('', 2)
+
+ def test_02_add_package_from_dict(self):
+ self._create_package()
+ self._check_search_results('', 3)
+ self._check_search_results('wee', 1, ['council-owned-litter-bins'])
+ self._remove_package()
+
+ def test_03_update_package_from_dict(self):
+ self._create_package()
+ package = model.Package.by_name('council-owned-litter-bins')
+ self.new_pkg_dict['name'] = 'new_name'
+ self.new_pkg_dict['extras']['published_by'] = 'meeeee'
+ self._create_package(package)
+ self._check_search_results('', 3)
+ self._check_search_results('meeeee', 1, ['new_name'])
+
+ package = model.Package.by_name('new_name')
+ self.new_pkg_dict['name'] = 'council-owned-litter-bins'
+ self._create_package(package)
+ self._check_search_results('', 3)
+ self._check_search_results('wee', 1, ['council-owned-litter-bins'])
+ self._remove_package()
+
+ def test_04_delete_package_from_dict(self):
+ self._create_package()
+ package = model.Package.by_name('council-owned-litter-bins')
+ assert package
+ self._remove_package()
+ self._check_search_results('', 2)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ckan/tests/lib/test_solr_search_index.py Wed Aug 17 15:07:22 2011 +0100
@@ -0,0 +1,106 @@
+import solr
+from pylons import config
+from ckan import model
+import ckan.lib.search as search
+from ckan.tests import TestController, CreateTestData
+
+class TestSolrConfig(TestController):
+ """
+ Make sure that solr is enabled for this ckan instance.
+ """
+ @classmethod
+ def setup_class(cls):
+ config['search_backend'] = 'solr'
+
+ def test_solr_backend_returned(self):
+ assert isinstance(search.get_backend(), search.SolrSearchBackend),\
+ search.get_backend()
+
+ def test_solr_url_exists(self):
+ assert config.get('solr_url')
+ # solr.SolrConnection will throw an exception if it can't connect
+ solr.SolrConnection(config.get('solr_url'))
+
+
+class TestSearchIndex(TestController):
+ """
+ Tests that a package is indexed when the packagenotification is
+ received by the indexer.
+ """
+ @classmethod
+ def setup_class(cls):
+ CreateTestData.create()
+ cls.solr = solr.SolrConnection(config.get('solr_url'))
+ cls.fq = " +site_id:\"%s\" " % config.get('ckan.site_id')
+
+ @classmethod
+ def teardown_class(cls):
+ model.repo.rebuild_db()
+ cls.solr.close()
+
+ def teardown(self):
+ # clear the search index after every test
+ search.get_backend().index_for('Package').clear()
+
+ def test_index(self):
+ pkg_dict = {
+ 'id': u'penguin-id',
+ 'title': u'penguin',
+ 'state': u'active'
+ }
+ search.dispatch_by_operation('Package', pkg_dict, 'new')
+ response = self.solr.query('title:penguin', fq=self.fq)
+ assert len(response) == 1, len(response)
+ assert response.results[0]['title'] == 'penguin'
+
+ def test_no_state_not_indexed(self):
+ pkg_dict = {
+ 'title': 'penguin'
+ }
+ search.dispatch_by_operation('Package', pkg_dict, 'new')
+ response = self.solr.query('title:penguin', fq=self.fq)
+ assert len(response) == 0, len(response)
+
+ def test_index_clear(self):
+ pkg_dict = {
+ 'id': u'penguin-id',
+ 'title': u'penguin',
+ 'state': u'active'
+ }
+ search.dispatch_by_operation('Package', pkg_dict, 'new')
+ response = self.solr.query('title:penguin', fq=self.fq)
+ assert len(response) == 1, len(response)
+ search.get_backend().index_for('Package').clear()
+ response = self.solr.query('title:penguin', fq=self.fq)
+ assert len(response) == 0
+
+
+class TestSolrSearch:
+ @classmethod
+ def setup_class(cls):
+ CreateTestData.create_search_test_data()
+ cls.solr = solr.SolrConnection(config.get('solr_url'))
+ cls.fq = " +site_id:\"%s\" " % config.get('ckan.site_id')
+ search.rebuild()
+
+ @classmethod
+ def teardown_class(cls):
+ model.repo.rebuild_db()
+ cls.solr.close()
+ search.get_backend().index_for('Package').clear()
+
+ def test_0_indexing(self):
+ """
+ Make sure that all packages created by CreateTestData.create_search_test_data
+ have been added to the search index.
+ """
+ results = self.solr.query('*:*', fq=self.fq)
+ assert len(results) == 6, len(results)
+
+ def test_1_basic(self):
+ results = self.solr.query('sweden', fq=self.fq)
+ assert len(results) == 2
+ result_names = [r['name'] for r in results]
+ assert 'se-publications' in result_names
+ assert 'se-opengov' in result_names
+
--- a/setup.py Wed Aug 17 14:59:07 2011 +0100
+++ b/setup.py Wed Aug 17 15:07:22 2011 +0100
@@ -81,6 +81,7 @@
[ckan.search]
sql = ckan.lib.search.sql:SqlSearchBackend
+ solr = ckan.lib.search.solr_backend:SolrSearchBackend
[ckan.plugins]
synchronous_search = ckan.lib.search.worker:SynchronousSearchPlugin
Repository URL: https://bitbucket.org/okfn/ckan/
--
This is a commit notification from bitbucket.org. You are receiving
this email because you have the service enabled and are its addressed
recipient.
More information about the ckan-changes
mailing list