[ckan-changes] commit/ckan: 5 new changesets
Bitbucket
commits-noreply at bitbucket.org
Thu Aug 18 13:03:35 UTC 2011
5 new changesets in ckan:
http://bitbucket.org/okfn/ckan/changeset/13d4c5b081d3/
changeset: 13d4c5b081d3
branch: feature-1275-solr-search
user: John Glover
date: 2011-08-18 12:07:57
summary: [solr] Move search indexing code to index.py
affected #: 3 files (7.1 KB)
--- a/ckan/lib/search/__init__.py Wed Aug 17 15:45:52 2011 +0100
+++ b/ckan/lib/search/__init__.py Thu Aug 18 11:07:57 2011 +0100
@@ -1,9 +1,13 @@
import logging
import pkg_resources
from pylons import config
-from common import QueryOptions, SearchError, SearchQuery, SearchBackend, SearchIndex
-from solr_backend import SolrSearchBackend
-from worker import dispatch_by_operation
+from ckan import model
+from ckan.model import DomainObjectOperation
+from ckan.plugins import SingletonPlugin, implements, IDomainObjectModification
+from ckan.lib.dictization.model_dictize import package_to_api1
+from common import QueryOptions, SearchQuery, SearchBackend, SearchIndex
+# from solr_backend import SolrSearchBackend
+from index import PackageSearchIndex, NoopSearchIndex
log = logging.getLogger(__name__)
@@ -21,29 +25,68 @@
'callback': None, # simply passed through
}
-# TODO make sure all backends are thread-safe!
-INSTANCE_CACHE = {}
+_INDICES = {
+ 'package': PackageSearchIndex
+}
-def get_backend(backend=None):
- if backend is None:
- backend = config.get('search_backend', 'sql')
- klass = None
- for ep in pkg_resources.iter_entry_points("ckan.search", backend.strip().lower()):
- klass = ep.load()
- if klass is None:
- raise KeyError("No search backend called %s" % (backend,))
- if not klass in INSTANCE_CACHE.keys():
- log.debug("Creating search backend: %s" % klass.__name__)
- INSTANCE_CACHE[klass] = klass()
- return INSTANCE_CACHE.get(klass)
+def _normalize_type(_type):
+ if isinstance(_type, model.DomainObject):
+ _type = _type.__class__
+ if isinstance(_type, type):
+ _type = _type.__name__
+ return _type.strip().lower()
+
+def index_for(_type):
+ """ Get a SearchIndex instance sub-class suitable for the specified type. """
+ try:
+ _type_n = _normalize_type(_type)
+ return _INDICES[_type_n]()
+ except KeyError, ke:
+ log.warn("Unknown search type: %s" % _type)
+ return NoopSearchIndex()
+
+def query_for(_type):
+ """ Query for entities of a specified type (name, class, instance). """
+ raise Exception("NotYetImplemented")
+
+def dispatch_by_operation(entity_type, entity, operation):
+ """Call the appropriate index method for a given notification."""
+ try:
+ index = index_for(entity_type)
+ if operation == DomainObjectOperation.new:
+ index.insert_dict(entity)
+ elif operation == DomainObjectOperation.changed:
+ index.update_dict(entity)
+ elif operation == DomainObjectOperation.deleted:
+ index.remove_dict(entity)
+ else:
+ log.warn("Unknown operation: %s" % operation)
+ except Exception, ex:
+ log.exception(ex)
+
+class SearchError(Exception): pass
+
+class SynchronousSearchPlugin(SingletonPlugin):
+ """Update the search index automatically."""
+ implements(IDomainObjectModification, inherit=True)
+
+ def notify(self, entity, operation):
+ if operation != DomainObjectOperation.deleted:
+ dispatch_by_operation(entity.__class__.__name__,
+ package_to_api1(entity, {'model': model}),
+ operation)
+ elif operation == DomainObjectOperation.deleted:
+ dispatch_by_operation(entity.__class__.__name__,
+ {'id': entity.id}, operation)
+ else:
+ log.warn("Discarded Sync. indexing for: %s" % entity)
def rebuild():
from ckan import model
- backend = get_backend()
log.debug("Rebuilding search index...")
# Packages
- package_index = backend.index_for(model.Package)
+ package_index = index_for(model.Package)
package_index.clear()
for pkg in model.Session.query(model.Package).all():
package_index.insert_entity(pkg)
@@ -51,8 +94,7 @@
def check():
from ckan import model
- backend = get_backend()
- package_index = backend.index_for(model.Package)
+ package_index = index_for(model.Package)
log.debug("Checking packages search index...")
pkgs_q = model.Session.query(model.Package).filter_by(state=model.State.ACTIVE)
@@ -66,18 +108,11 @@
def show(package_reference):
from ckan import model
- backend = get_backend()
- package_index = backend.index_for(model.Package)
+ package_index = index_for(model.Package)
print package_index.get_index(package_reference)
def clear():
from ckan import model
- backend = get_backend()
log.debug("Clearing search index...")
- package_index = backend.index_for(model.Package)
+ package_index = index_for(model.Package)
package_index.clear()
-
-def query_for(_type, backend=None):
- """ Query for entities of a specified type (name, class, instance). """
- return get_backend(backend=backend).query_for(_type)
-
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ckan/lib/search/index.py Thu Aug 18 11:07:57 2011 +0100
@@ -0,0 +1,161 @@
+from pylons import config
+import itertools
+import string
+from solr import SolrConnection # == solrpy
+import logging
+log = logging.getLogger(__name__)
+
+TYPE_FIELD = "entity_type"
+PACKAGE_TYPE = "package"
+KEY_CHARS = string.digits + string.letters + "_-"
+SOLR_FIELDS = [TYPE_FIELD, "res_url", "text", "urls", "indexed_ts", "site_id"]
+RESERVED_FIELDS = SOLR_FIELDS + ["tags", "groups", "res_description",
+ "res_format", "res_url"]
+# HACK: this is copied over from model.PackageRelationship
+RELATIONSHIP_TYPES = [
+ (u'depends_on', u'dependency_of'),
+ (u'derives_from', u'has_derivation'),
+ (u'links_to', u'linked_from'),
+ (u'child_of', u'parent_of'),
+]
+
+def make_connection(config):
+ url = config.get('solr_url', 'http://localhost:8983/solr')
+ user = config.get('solr_user')
+ password = config.get('solr_password')
+
+ if user is not None and password is not None:
+ return SolrConnection(url, http_user=user, http_pass=password)
+ else:
+ return SolrConnection(url)
+
+def clear_index(config):
+ conn = make_connection(config)
+ query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
+ try:
+ conn.delete_query(query)
+ conn.commit()
+ finally:
+ conn.close()
+
+class SearchIndex(object):
+ """
+ A search index handles the management of documents of a specific type in the
+ index, but no queries.
+ The default implementation maps many of the methods, so most subclasses will
+ only have to implement ``update_dict`` and ``remove_dict``.
+ """
+
+ def __init__(self):
+ pass
+
+ def insert_dict(self, data):
+ """ Insert new data from a dictionary. """
+ return self.update_dict(data)
+
+ def insert_entity(self, entity):
+ """ Insert new data from a domain object. """
+ return self.insert_dict(entity.as_dict())
+
+ def update_dict(self, data):
+ """ Update data from a dictionary. """
+ log.debug("NOOP Index: %s" % ",".join(data.keys()))
+
+ def update_entity(self, entity):
+ """ Update data from a domain object. """
+ # in convention we trust:
+ return self.update_dict(entity.as_dict())
+
+ def remove_dict(self, data):
+ """ Delete an index entry uniquely identified by ``data``. """
+ log.debug("NOOP Delete: %s" % ",".join(data.keys()))
+
+ def remove_entity(self, entity):
+ """ Delete ``entity``. """
+ return self.remove_dict(entity.as_dict())
+
+ def clear(self):
+ """ Delete the complete index. """
+ clear_index(config)
+
+ def get_all_entity_ids(self):
+ """ Return a list of entity IDs in the index. """
+ raise NotImplemented
+
+class NoopSearchIndex(SearchIndex): pass
+
+class PackageSearchIndex(SearchIndex):
+ def remove_dict(self, pkg_dict):
+ self.delete_package(pkg_dict, config)
+
+ def update_dict(self, pkg_dict):
+ self.index_package(pkg_dict, config)
+
+ def index_package(self, pkg_dict, config):
+ if pkg_dict is None:
+ return
+ if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
+ return self.delete_package(pkg_dict, config)
+ conn = make_connection(config)
+ index_fields = RESERVED_FIELDS + pkg_dict.keys()
+
+ # include the extras in the main namespace
+ extras = pkg_dict.get('extras', {})
+ for (key, value) in extras.items():
+ if isinstance(value, (tuple, list)):
+ value = " ".join(map(unicode, value))
+ key = ''.join([c for c in key if c in KEY_CHARS])
+ pkg_dict['extras_' + key] = value
+ if key not in index_fields:
+ pkg_dict[key] = value
+ if 'extras' in pkg_dict:
+ del pkg_dict['extras']
+
+ # flatten the structure for indexing:
+ for resource in pkg_dict.get('resources', []):
+ for (okey, nkey) in [('description', 'res_description'),
+ ('format', 'res_format'),
+ ('url', 'res_url')]:
+ pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
+ if 'resources' in pkg_dict:
+ del pkg_dict['resources']
+
+ # index relationships as <type>:<object>
+ rel_dict = {}
+ rel_types = list(itertools.chain(RELATIONSHIP_TYPES))
+ for rel in pkg_dict.get('relationships', []):
+ _type = rel.get('type', 'rel')
+ if (_type in pkg_dict.keys()) or (_type not in rel_types):
+ continue
+ rel_dict[_type] = rel_dict.get(_type, []) + [rel.get('object')]
+
+ pkg_dict.update(rel_dict)
+
+ if 'relationships' in pkg_dict:
+ del pkg_dict['relationships']
+
+ pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
+ pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])
+
+ # mark this CKAN instance as data source:
+ pkg_dict['site_id'] = config.get('ckan.site_id')
+
+ # send to solr:
+ try:
+ conn.add_many([pkg_dict])
+ conn.commit(wait_flush=False, wait_searcher=False)
+ finally:
+ conn.close()
+
+ log.debug("Updated index for %s" % pkg_dict.get('name'))
+
+ def delete_package(self, pkg_dict, config):
+ conn = make_connection(config)
+ query = "+%s:%s +id:\"%s\" +site_id:\"%s\"" % (TYPE_FIELD, PACKAGE_TYPE,
+ pkg_dict.get('id'),
+ config.get('ckan.site_id'))
+ try:
+ conn.delete_query(query)
+ conn.commit()
+ finally:
+ conn.close()
--- a/ckan/tests/lib/test_solr_search_index.py Wed Aug 17 15:45:52 2011 +0100
+++ b/ckan/tests/lib/test_solr_search_index.py Thu Aug 18 11:07:57 2011 +0100
@@ -12,17 +12,13 @@
def setup_class(cls):
config['search_backend'] = 'solr'
- def test_solr_backend_returned(self):
- assert isinstance(search.get_backend(), search.SolrSearchBackend),\
- search.get_backend()
-
def test_solr_url_exists(self):
assert config.get('solr_url')
# solr.SolrConnection will throw an exception if it can't connect
solr.SolrConnection(config.get('solr_url'))
-class TestSearchIndex(TestController):
+class TestSolrSearchIndex(TestController):
"""
Tests that a package is indexed when the packagenotification is
received by the indexer.
@@ -41,7 +37,7 @@
def teardown(self):
# clear the search index after every test
- search.get_backend().index_for('Package').clear()
+ search.index_for('Package').clear()
def test_index(self):
pkg_dict = {
@@ -71,7 +67,7 @@
search.dispatch_by_operation('Package', pkg_dict, 'new')
response = self.solr.query('title:penguin', fq=self.fq)
assert len(response) == 1, len(response)
- search.get_backend().index_for('Package').clear()
+ search.index_for('Package').clear()
response = self.solr.query('title:penguin', fq=self.fq)
assert len(response) == 0
@@ -88,7 +84,7 @@
def teardown_class(cls):
model.repo.rebuild_db()
cls.solr.close()
- search.get_backend().index_for('Package').clear()
+ search.index_for('Package').clear()
def test_0_indexing(self):
"""
http://bitbucket.org/okfn/ckan/changeset/bbc7508b7047/
changeset: bbc7508b7047
branch: feature-1275-solr-search
user: John Glover
date: 2011-08-18 14:41:00
summary: [solr] move solr connection function to common.py
affected #: 2 files (10.0 KB)
--- a/ckan/lib/search/common.py Thu Aug 18 11:07:57 2011 +0100
+++ b/ckan/lib/search/common.py Thu Aug 18 13:41:00 2011 +0100
@@ -1,296 +1,15 @@
+from solr import SolrConnection
import logging
-
-from paste.util.multidict import MultiDict
-from paste.deploy.converters import asbool
-from ckan import model
-
log = logging.getLogger(__name__)
class SearchError(Exception): pass
-class SearchBackend(object):
- """
- A search backend describes the engine used to actually maintain data. This can be
- something like Solr, Xapian, or just a mapping onto SQL queries.
-
- The backend stores a mapping of ``SearchIndex``, ``SearchQuery`` pairs for all
- entity types that are supposed to be queried using this engine.
-
- Entity types can be given as classes, objects or strings that uniquely identify a
- ``DomainObject`` type used in CKAN.
- """
-
- def __init__(self):
- self._typed_queries = {}
- self._typed_indices = {}
- self._setup()
-
- def _setup(self):
- """ This method is overridden by subclasses to actually register handlers """
- pass
-
- def _normalize_type(self, _type):
- if isinstance(_type, model.DomainObject):
- _type = _type.__class__
- if isinstance(_type, type):
- _type = _type.__name__
- return _type.strip().lower()
-
- def register(self, _type, index_class, query_class):
- """ Register a type by setting both query and index classes. """
- _type = self._normalize_type(_type)
- self._typed_queries[_type] = query_class
- self._typed_indices[_type] = index_class
-
- def unregister(self, _type):
- """ TODO: Find out what would possibly use this. """
- _type = self._normalize_type(_type)
- if _type in self._typed_queries:
- del self._typed_queries[_type]
- if _type in self._typed_indices:
- del self._typed_indices[_type]
-
- def query_for(self, _type):
- """ Get a SearchQuery instance sub-class suitable for the specified type. """
- try:
- _type_n = self._normalize_type(_type)
- return self._typed_queries[_type_n](self)
- except KeyError, ke:
- raise SearchError("Unknown search type: %s" % _type)
-
- def index_for(self, _type):
- """ Get a SearchIndex instance sub-class suitable for the specified type. """
- try:
- _type_n = self._normalize_type(_type)
- return self._typed_indices[_type_n](self)
- except KeyError, ke:
- log.warn("Unknown search type: %s" % _type)
- return NoopSearchIndex(self)
-
- def types(self):
- return self._typed_queries.keys()
-
+def make_connection(config):
+ url = config.get('solr_url', 'http://localhost:8983/solr')
+ user = config.get('solr_user')
+ password = config.get('solr_password')
-class SearchQuery(object):
- """
- A query is ... when you ask the search engine things. SearchQuery is intended
- to be used for only one query, i.e. it sets state. Definitely not thread-safe.
- """
-
- def __init__(self, backend):
- self.backend = backend
- self.results = []
- self.count = 0
-
- @property
- def open_licenses(self):
- # backend isn't exactly the very best place to put these, but they stay
- # there persistently.
- # TODO: figure out if they change during run-time.
- if not hasattr(self.backend, '_open_licenses'):
- self.backend._open_licenses = []
- for license in model.Package.get_license_register().values():
- if license and license.isopen():
- self.backend._open_licenses.append(license.id)
- return self.backend._open_licenses
-
- def _format_results(self):
- if not self.options.return_objects and len(self.results):
- if self.options.all_fields:
- self.results = [r.as_dict() for r in self.results]
- else:
- attr_name = self.options.ref_entity_with_attr
- self.results = [getattr(entity, attr_name) for entity in self.results]
-
- def run(self, query=None, terms=[], fields={}, facet_by=[], options=None, **kwargs):
- if options is None:
- options = QueryOptions(**kwargs)
- else:
- options.update(kwargs)
- self.options = options
- self.options.validate()
- self.facet_by = facet_by
- self.facets = dict()
- self.query = QueryParser(query, terms, fields)
- self.query.validate()
- self._run()
- self._format_results()
- return {'results': self.results, 'count': self.count}
-
- def _run(self):
- raise SearchError("SearchQuery._run() not implemented!")
-
- # convenience, allows to query(..)
- __call__ = run
-
-
-class QueryOptions(dict):
- """
- Options specify aspects of the search query which are only tangentially related
- to the query terms (such as limits, etc.).
- """
-
- BOOLEAN_OPTIONS = ['filter_by_downloadable', 'filter_by_openness', 'all_fields']
- INTEGER_OPTIONS = ['offset', 'limit']
-
- def __init__(self, **kwargs):
- from ckan.lib.search import DEFAULT_OPTIONS
-
- # set values according to the defaults
- for option_name, default_value in DEFAULT_OPTIONS.items():
- if not option_name in self:
- self[option_name] = default_value
-
- super(QueryOptions, self).__init__(**kwargs)
-
- def validate(self):
- for key, value in self.items():
- if key in self.BOOLEAN_OPTIONS:
- try:
- value = asbool(value)
- except ValueError:
- raise SearchError('Value for search option %r must be True or False (1 or 0) but received %r' % (key, value))
- elif key in self.INTEGER_OPTIONS:
- try:
- value = int(value)
- except ValueError:
- raise SearchError('Value for search option %r must be an integer but received %r' % (key, value))
- self[key] = value
-
- def __getattr__(self, name):
- return self.get(name)
-
- def __setattr__(self, name, value):
- self[name] = value
-
-
-class QueryParser(object):
- """
- The query parser will take any incoming query specifications and turn
- them into field-specific and general query parts.
- """
-
- def __init__(self, query, terms, fields):
- self._query = query
- self._terms = terms
- self._fields = MultiDict(fields)
-
- @property
- def query(self):
- if not hasattr(self, '_combined_query'):
- parts = [self._query if self._query is not None else '']
-
- for term in self._terms:
- if term.find(u' ') != -1:
- term = u"\"%s\"" % term
- parts.append(term.strip())
-
- for field, value in self._fields.items():
- if value.find(' ') != -1:
- value = u"\"%s\"" % value
- parts.append(u"%s:%s" % (field.strip(), value.strip()))
-
- self._combined_query = u' '.join(parts)
- return self._combined_query
-
- def _query_tokens(self):
- """ Split the query string, leaving quoted strings intact. """
- if self._query:
- inside_quote = False
- buf = u''
- for ch in self._query:
- if ch == u' ' and not inside_quote:
- if len(buf):
- yield buf.strip()
- buf = u''
- elif ch == inside_quote:
- inside_quote = False
- elif ch in [u"\"", u"'"]:
- inside_quote = ch
- else:
- buf += ch
- if len(buf):
- yield buf.strip()
-
- def _parse_query(self):
- """ Decompose the query string into fields and terms. """
- self._combined_fields = MultiDict(self._fields)
- self._combined_terms = list(self._terms)
- for token in self._query_tokens():
- colon_pos = token.find(u':')
- if colon_pos != -1:
- field = token[:colon_pos]
- value = token[colon_pos+1:]
- value = value.strip('"').strip("'").strip()
- self._combined_fields.add(field, value)
- else:
- self._combined_terms.append(token)
-
- @property
- def fields(self):
- if not hasattr(self, '_combined_fields'):
- self._parse_query()
- return self._combined_fields
-
- @property
- def terms(self):
- if not hasattr(self, '_combined_terms'):
- self._parse_query()
- return self._combined_terms
-
- def validate(self):
- """ Check that this is a valid query. """
- pass
-
- def __str__(self):
- return self.query
-
- def __repr__(self):
- return "Query(%r)" % self.query
-
-
-class SearchIndex(object):
- """
- A search index handles the management of documents of a specific type in the
- index, but no queries.
- The default implementation maps many of the methods, so most subclasses will
- only have to implement ``update_dict`` and ``remove_dict``.
- """
-
- def __init__(self, backend):
- self.backend = backend
-
- def insert_dict(self, data):
- """ Insert new data from a dictionary. """
- return self.update_dict(data)
-
- def insert_entity(self, entity):
- """ Insert new data from a domain object. """
- return self.insert_dict(entity.as_dict())
-
- def update_dict(self, data):
- """ Update data from a dictionary. """
- log.debug("NOOP Index: %s" % ",".join(data.keys()))
-
- def update_entity(self, entity):
- """ Update data from a domain object. """
- # in convention we trust:
- return self.update_dict(entity.as_dict())
-
- def remove_dict(self, data):
- """ Delete an index entry uniquely identified by ``data``. """
- log.debug("NOOP Delete: %s" % ",".join(data.keys()))
-
- def remove_entity(self, entity):
- """ Delete ``entity``. """
- return self.remove_dict(entity.as_dict())
-
- def clear(self):
- """ Delete the complete index. """
- log.debug("NOOP Index reset")
-
- def get_all_entity_ids(self):
- """ Return a list of entity IDs in the index. """
- raise NotImplemented
-
-class NoopSearchIndex(SearchIndex): pass
+ if user is not None and password is not None:
+ return SolrConnection(url, http_user=user, http_pass=password)
+ else:
+ return SolrConnection(url)
--- a/ckan/lib/search/index.py Thu Aug 18 11:07:57 2011 +0100
+++ b/ckan/lib/search/index.py Thu Aug 18 13:41:00 2011 +0100
@@ -1,7 +1,7 @@
from pylons import config
import itertools
import string
-from solr import SolrConnection # == solrpy
+from common import make_connection
import logging
log = logging.getLogger(__name__)
@@ -19,16 +19,6 @@
(u'child_of', u'parent_of'),
]
-def make_connection(config):
- url = config.get('solr_url', 'http://localhost:8983/solr')
- user = config.get('solr_user')
- password = config.get('solr_password')
-
- if user is not None and password is not None:
- return SolrConnection(url, http_user=user, http_pass=password)
- else:
- return SolrConnection(url)
-
def clear_index(config):
conn = make_connection(config)
query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
http://bitbucket.org/okfn/ckan/changeset/bb0c85b88b44/
changeset: bb0c85b88b44
branch: feature-1275-solr-search
user: John Glover
date: 2011-08-18 14:42:11
summary: [solr] update synchronous search plugin entry point
affected #: 1 file (7 bytes)
--- a/setup.py Thu Aug 18 13:41:00 2011 +0100
+++ b/setup.py Thu Aug 18 13:42:11 2011 +0100
@@ -84,7 +84,7 @@
solr = ckan.lib.search.solr_backend:SolrSearchBackend
[ckan.plugins]
- synchronous_search = ckan.lib.search.worker:SynchronousSearchPlugin
+ synchronous_search = ckan.lib.search:SynchronousSearchPlugin
[ckan.system_plugins]
domain_object_mods = ckan.model.modification:DomainObjectModificationExtension
http://bitbucket.org/okfn/ckan/changeset/a522091287c7/
changeset: a522091287c7
branch: feature-1275-solr-search
user: John Glover
date: 2011-08-18 14:42:59
summary: [solr] move all search query code to query.py, remove references to get_backend()
affected #: 7 files (9.2 KB)
--- a/ckan/lib/search/__init__.py Thu Aug 18 13:42:11 2011 +0100
+++ b/ckan/lib/search/__init__.py Thu Aug 18 13:42:59 2011 +0100
@@ -1,13 +1,11 @@
import logging
-import pkg_resources
-from pylons import config
from ckan import model
from ckan.model import DomainObjectOperation
from ckan.plugins import SingletonPlugin, implements, IDomainObjectModification
from ckan.lib.dictization.model_dictize import package_to_api1
-from common import QueryOptions, SearchQuery, SearchBackend, SearchIndex
-# from solr_backend import SolrSearchBackend
+from common import SearchError
from index import PackageSearchIndex, NoopSearchIndex
+from query import PackageSearchQuery, QueryOptions
log = logging.getLogger(__name__)
@@ -29,6 +27,10 @@
'package': PackageSearchIndex
}
+_QUERIES = {
+ 'package': PackageSearchQuery
+}
+
def _normalize_type(_type):
if isinstance(_type, model.DomainObject):
_type = _type.__class__
@@ -45,9 +47,13 @@
log.warn("Unknown search type: %s" % _type)
return NoopSearchIndex()
-def query_for(_type):
- """ Query for entities of a specified type (name, class, instance). """
- raise Exception("NotYetImplemented")
+def query_for( _type):
+ """ Get a SearchQuery instance sub-class suitable for the specified type. """
+ try:
+ _type_n = _normalize_type(_type)
+ return _QUERIES[_type_n]()
+ except KeyError, ke:
+ raise SearchError("Unknown search type: %s" % _type)
def dispatch_by_operation(entity_type, entity, operation):
"""Call the appropriate index method for a given notification."""
@@ -64,8 +70,6 @@
except Exception, ex:
log.exception(ex)
-class SearchError(Exception): pass
-
class SynchronousSearchPlugin(SingletonPlugin):
"""Update the search index automatically."""
implements(IDomainObjectModification, inherit=True)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ckan/lib/search/query.py Thu Aug 18 13:42:59 2011 +0100
@@ -0,0 +1,248 @@
+from pylons import config
+from paste.util.multidict import MultiDict
+from paste.deploy.converters import asbool
+from ckan import model
+from ckan.authz import Authorizer
+from common import make_connection, SearchError
+import logging
+log = logging.getLogger(__name__)
+
+_open_licenses = None
+
+class QueryOptions(dict):
+ """
+ Options specify aspects of the search query which are only tangentially related
+ to the query terms (such as limits, etc.).
+ """
+
+ BOOLEAN_OPTIONS = ['filter_by_downloadable', 'filter_by_openness', 'all_fields']
+ INTEGER_OPTIONS = ['offset', 'limit']
+
+ def __init__(self, **kwargs):
+ from ckan.lib.search import DEFAULT_OPTIONS
+
+ # set values according to the defaults
+ for option_name, default_value in DEFAULT_OPTIONS.items():
+ if not option_name in self:
+ self[option_name] = default_value
+
+ super(QueryOptions, self).__init__(**kwargs)
+
+ def validate(self):
+ for key, value in self.items():
+ if key in self.BOOLEAN_OPTIONS:
+ try:
+ value = asbool(value)
+ except ValueError:
+ raise SearchError('Value for search option %r must be True or False (1 or 0) but received %r' % (key, value))
+ elif key in self.INTEGER_OPTIONS:
+ try:
+ value = int(value)
+ except ValueError:
+ raise SearchError('Value for search option %r must be an integer but received %r' % (key, value))
+ self[key] = value
+
+ def __getattr__(self, name):
+ return self.get(name)
+
+ def __setattr__(self, name, value):
+ self[name] = value
+
+
+class QueryParser(object):
+ """
+ The query parser will take any incoming query specifications and turn
+ them into field-specific and general query parts.
+ """
+
+ def __init__(self, query, terms, fields):
+ self._query = query
+ self._terms = terms
+ self._fields = MultiDict(fields)
+
+ @property
+ def query(self):
+ if not hasattr(self, '_combined_query'):
+ parts = [self._query if self._query is not None else '']
+
+ for term in self._terms:
+ if term.find(u' ') != -1:
+ term = u"\"%s\"" % term
+ parts.append(term.strip())
+
+ for field, value in self._fields.items():
+ if value.find(' ') != -1:
+ value = u"\"%s\"" % value
+ parts.append(u"%s:%s" % (field.strip(), value.strip()))
+
+ self._combined_query = u' '.join(parts)
+ return self._combined_query
+
+ def _query_tokens(self):
+ """ Split the query string, leaving quoted strings intact. """
+ if self._query:
+ inside_quote = False
+ buf = u''
+ for ch in self._query:
+ if ch == u' ' and not inside_quote:
+ if len(buf):
+ yield buf.strip()
+ buf = u''
+ elif ch == inside_quote:
+ inside_quote = False
+ elif ch in [u"\"", u"'"]:
+ inside_quote = ch
+ else:
+ buf += ch
+ if len(buf):
+ yield buf.strip()
+
+ def _parse_query(self):
+ """ Decompose the query string into fields and terms. """
+ self._combined_fields = MultiDict(self._fields)
+ self._combined_terms = list(self._terms)
+ for token in self._query_tokens():
+ colon_pos = token.find(u':')
+ if colon_pos != -1:
+ field = token[:colon_pos]
+ value = token[colon_pos+1:]
+ value = value.strip('"').strip("'").strip()
+ self._combined_fields.add(field, value)
+ else:
+ self._combined_terms.append(token)
+
+ @property
+ def fields(self):
+ if not hasattr(self, '_combined_fields'):
+ self._parse_query()
+ return self._combined_fields
+
+ @property
+ def terms(self):
+ if not hasattr(self, '_combined_terms'):
+ self._parse_query()
+ return self._combined_terms
+
+ def validate(self):
+ """ Check that this is a valid query. """
+ pass
+
+ def __str__(self):
+ return self.query
+
+ def __repr__(self):
+ return "Query(%r)" % self.query
+
+
+class SearchQuery(object):
+ """
+ A query is ... when you ask the search engine things. SearchQuery is intended
+ to be used for only one query, i.e. it sets state. Definitely not thread-safe.
+ """
+
+ def __init__(self):
+ self.results = []
+ self.count = 0
+
+ @property
+ def open_licenses(self):
+ # this isn't exactly the very best place to put these, but they stay
+ # there persistently.
+ # TODO: figure out if they change during run-time.
+ global _open_licenses
+ if not isinstance(_open_licenses, list):
+ _open_licenses = []
+ for license in model.Package.get_license_register().values():
+ if license and license.isopen():
+ _open_licenses.append(license.id)
+ return _open_licenses
+
+ def _format_results(self):
+ if not self.options.return_objects and len(self.results):
+ if self.options.all_fields:
+ self.results = [r.as_dict() for r in self.results]
+ else:
+ attr_name = self.options.ref_entity_with_attr
+ self.results = [getattr(entity, attr_name) for entity in self.results]
+
+ def run(self, query=None, terms=[], fields={}, facet_by=[], options=None, **kwargs):
+ if options is None:
+ options = QueryOptions(**kwargs)
+ else:
+ options.update(kwargs)
+ self.options = options
+ self.options.validate()
+ self.facet_by = facet_by
+ self.facets = dict()
+ self.query = QueryParser(query, terms, fields)
+ self.query.validate()
+ self._run()
+ self._format_results()
+ return {'results': self.results, 'count': self.count}
+
+ def _run(self):
+ raise SearchError("SearchQuery._run() not implemented!")
+
+ # convenience, allows to query(..)
+ __call__ = run
+
+
+class PackageSearchQuery(SearchQuery):
+ def _run(self):
+ fq = ""
+
+ # Filter for options
+ if self.options.filter_by_downloadable:
+ fq += u" +res_url:[* TO *] " # not null resource URL
+ if self.options.filter_by_openness:
+ licenses = ["license_id:%s" % id for id in self.open_licenses]
+ licenses = " OR ".join(licenses)
+ fq += " +(%s) " % licenses
+
+ order_by = self.options.order_by
+ if order_by == 'rank' or order_by is None:
+ order_by = 'score'
+
+ # show only results from this CKAN instance:
+ fq = fq + " +site_id:\"%s\" " % config.get('ckan.site_id')
+
+ # Filter for package status
+ fq += "+state:active "
+
+ # configurable for iati: full options list
+ facet_limit = int(config.get('search.facets.limit', '50'))
+
+ # query
+ query = self.query.query
+ if (not query) or (not query.strip()):
+ # no query terms, i.e. all documents
+ query = '*:*'
+
+ conn = make_connection(config)
+ try:
+ data = conn.query(query,
+ fq=fq,
+ # make sure data.facet_counts is set:
+ facet='true',
+ facet_limit=facet_limit,
+ facet_field=self.facet_by,
+ facet_mincount=1,
+ start=self.options.offset,
+ rows=self.options.limit,
+ fields='id,score',
+ sort_order='desc',
+ sort=order_by)
+
+ except Exception, e:
+ # this wrapping will be caught further up in the WUI.
+ log.exception(e)
+ raise SearchError(e)
+ finally:
+ conn.close()
+
+ self.count = int(data.numFound)
+ scores = dict([(r.get('id'), r.get('score')) for r in data.results])
+ q = Authorizer().authorized_query(self.options.username, model.Package)
+ q = q.filter(model.Package.id.in_(scores.keys()))
+ self.facets = data.facet_counts.get('facet_fields', {})
+ self.results = sorted(q, key=lambda r: scores[r.id], reverse=True)
--- a/ckan/lib/search/solr_backend.py Thu Aug 18 13:42:11 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,90 +0,0 @@
-from pylons import config
-from ckan.lib.search import SearchBackend, SearchQuery, SearchIndex, \
- SearchError
-from ckan.authz import Authorizer
-from ckan import model
-from solr_indexing import make_connection, index_package, delete_package, \
- clear_index
-import logging
-log = logging.getLogger(__name__)
-
-
-class SolrSearchBackend(SearchBackend):
-
- def _setup(self):
- self.register(model.Package.__name__, PackageSolrSearchIndex, PackageSolrSearchQuery)
-
-class PackageSolrSearchQuery(SearchQuery):
-
- def _run(self):
- fq = ""
-
- # Filter for options
- if self.options.filter_by_downloadable:
- fq += u" +res_url:[* TO *] " # not null resource URL
- if self.options.filter_by_openness:
- licenses = ["license_id:%s" % id for id in self.open_licenses]
- licenses = " OR ".join(licenses)
- fq += " +(%s) " % licenses
-
- order_by = self.options.order_by
- if order_by == 'rank' or order_by is None:
- order_by = 'score'
-
- # show only results from this CKAN instance:
- fq = fq + " +site_id:\"%s\" " % config.get('ckan.site_id')
-
- # Filter for package status
- fq += "+state:active "
-
- # configurable for iati: full options list
- facet_limit = int(config.get('search.facets.limit', '50'))
-
- # query
- query = self.query.query
- if (not query) or (not query.strip()):
- # no query terms, i.e. all documents
- query = '*:*'
-
- conn = make_connection(config)
- try:
- data = conn.query(query,
- fq=fq,
- # make sure data.facet_counts is set:
- facet='true',
- facet_limit=facet_limit,
- facet_field=self.facet_by,
- facet_mincount=1,
- start=self.options.offset,
- rows=self.options.limit,
- fields='id,score',
- sort_order='desc',
- sort=order_by)
-
- except Exception, e:
- # this wrapping will be caught further up in the WUI.
- log.exception(e)
- raise SearchError(e)
- finally:
- conn.close()
-
- self.count = int(data.numFound)
- scores = dict([(r.get('id'), r.get('score')) for r in data.results])
- q = Authorizer().authorized_query(self.options.username, model.Package)
- q = q.filter(model.Package.id.in_(scores.keys()))
- self.facets = data.facet_counts.get('facet_fields', {})
- self.results = sorted(q, key=lambda r: scores[r.id], reverse=True)
-
-
-class SolrSearchIndex(SearchIndex):
-
- def clear(self):
- clear_index(config)
-
-class PackageSolrSearchIndex(SolrSearchIndex):
-
- def remove_dict(self, pkg_dict):
- delete_package(pkg_dict, config)
-
- def update_dict(self, pkg_dict):
- index_package(pkg_dict, config)
--- a/ckan/lib/search/solr_indexing.py Thu Aug 18 13:42:11 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,113 +0,0 @@
-import itertools
-import string
-from solr import SolrConnection # == solrpy
-import logging
-log = logging.getLogger(__name__)
-
-TYPE_FIELD = "entity_type"
-PACKAGE_TYPE = "package"
-KEY_CHARS = string.digits + string.letters + "_-"
-
-SOLR_FIELDS = [TYPE_FIELD, "res_url", "text", "urls", "indexed_ts", "site_id"]
-
-RESERVED_FIELDS = SOLR_FIELDS + ["tags", "groups", "res_description",
- "res_format", "res_url"]
-
-# HACK: this is copied over from model.PackageRelationship
-RELATIONSHIP_TYPES = [(u'depends_on', u'dependency_of'),
- (u'derives_from', u'has_derivation'),
- (u'links_to', u'linked_from'),
- (u'child_of', u'parent_of'),
- ]
-
-def make_connection(config):
- url = config.get('solr_url', 'http://localhost:8983/solr')
- user = config.get('solr_user')
- password = config.get('solr_password')
-
- if user is not None and password is not None:
- return SolrConnection(url, http_user=user, http_pass=password)
- else:
- return SolrConnection(url)
-
-
-def index_package(pkg_dict, config):
- if pkg_dict is None:
- return
- if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
- return delete_package(pkg_dict, config)
- conn = make_connection(config)
- index_fields = RESERVED_FIELDS + pkg_dict.keys()
-
- # include the extras in the main namespace
- extras = pkg_dict.get('extras', {})
- for (key, value) in extras.items():
- if isinstance(value, (tuple, list)):
- value = " ".join(map(unicode, value))
- key = ''.join([c for c in key if c in KEY_CHARS])
- pkg_dict['extras_' + key] = value
- if key not in index_fields:
- pkg_dict[key] = value
- if 'extras' in pkg_dict:
- del pkg_dict['extras']
-
- # flatten the structure for indexing:
- for resource in pkg_dict.get('resources', []):
- for (okey, nkey) in [('description', 'res_description'),
- ('format', 'res_format'),
- ('url', 'res_url')]:
- pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
- if 'resources' in pkg_dict:
- del pkg_dict['resources']
-
- # index relationships as <type>:<object>
- rel_dict = {}
- rel_types = list(itertools.chain(RELATIONSHIP_TYPES))
- for rel in pkg_dict.get('relationships', []):
- _type = rel.get('type', 'rel')
- if (_type in pkg_dict.keys()) or (_type not in rel_types):
- continue
- rel_dict[_type] = rel_dict.get(_type, []) + [rel.get('object')]
-
- pkg_dict.update(rel_dict)
-
- if 'relationships' in pkg_dict:
- del pkg_dict['relationships']
-
- pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
- pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])
-
- # mark this CKAN instance as data source:
- pkg_dict['site_id'] = config.get('ckan.site_id')
-
- # send to solr:
- try:
- conn.add_many([pkg_dict])
- conn.commit(wait_flush=False, wait_searcher=False)
- finally:
- conn.close()
-
- log.debug("Updated index for %s" % pkg_dict.get('name'))
-
-
-def delete_package(pkg_dict, config):
- conn = make_connection(config)
- query = "+%s:%s +id:\"%s\" +site_id:\"%s\"" % (TYPE_FIELD, PACKAGE_TYPE,
- pkg_dict.get('id'),
- config.get('ckan.site_id'))
- try:
- conn.delete_query(query)
- conn.commit()
- finally:
- conn.close()
-
-
-def clear_index(config):
- conn = make_connection(config)
- query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
- try:
- conn.delete_query(query)
- conn.commit()
- finally:
- conn.close()
-
--- a/ckan/lib/search/worker.py Thu Aug 18 13:42:11 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-import logging
-
-import ckan.model as model
-from ckan.model import DomainObjectOperation
-from ckan.plugins import SingletonPlugin, implements, IDomainObjectModification
-from ckan.lib.dictization.model_dictize import package_to_api1
-# Needed for SolrIndexingWorker:
-# from ckanext.queue.worker import Worker
-# from indexing import index_package, delete_package
-
-log = logging.getLogger(__name__)
-
-
-def dispatch_by_operation(entity_type, entity, operation, backend=None):
- """ Call the appropriate index method for a given notification. """
- if backend is None:
- from ckan.lib.search import get_backend
- backend = get_backend()
- try:
- index = backend.index_for(entity_type)
- if operation == DomainObjectOperation.new:
- index.insert_dict(entity)
- elif operation == DomainObjectOperation.changed:
- index.update_dict(entity)
- elif operation == DomainObjectOperation.deleted:
- index.remove_dict(entity)
- else:
- log.warn("Unknown operation: %s" % operation)
- except Exception, ex:
- log.exception(ex)
-
-
-class SynchronousSearchPlugin(SingletonPlugin):
-
- implements(IDomainObjectModification, inherit=True)
-
- def notify(self, entity, operation):
-
- if operation != DomainObjectOperation.deleted:
- dispatch_by_operation(entity.__class__.__name__,
- package_to_api1(entity, {'model': model}),
- operation)
- elif operation == DomainObjectOperation.deleted:
- dispatch_by_operation(entity.__class__.__name__,
- {'id': entity.id}, operation)
- else:
- log.warn("Discarded Sync. indexing for: %s" % entity)
-
-
-# class SolrIndexingWorker(Worker):
-# """
-# SolrIndexingWorker. Requires ckanext-queue >= 0.1.
-# """
-
-# def consume(self, routing_key, operation, payload):
-# assert 'solr_url' in self.config
-# assert 'ckan.site_id' in self.config
-
-# if routing_key == 'Package':
-# if operation in ['new', 'changed']:
-# index_package(payload, self.config)
-# elif operation == 'deleted':
-# delete_package(payload, self.config)
--- a/ckan/tests/lib/test_solr_package_search.py Thu Aug 18 13:42:11 2011 +0100
+++ b/ckan/tests/lib/test_solr_package_search.py Thu Aug 18 13:42:59 2011 +0100
@@ -1,4 +1,3 @@
-from pylons import config
from ckan.tests import TestController, CreateTestData
from ckan import model
import ckan.lib.search as search
@@ -20,15 +19,12 @@
idx = [t.name for t in gils.tags].index(cls.tagname)
del gils.tags[idx]
model.repo.commit_and_remove()
- # solr
- config['search_backend'] = 'solr'
search.rebuild()
- cls.backend = search.get_backend()
@classmethod
def teardown_class(cls):
model.repo.rebuild_db()
- search.get_backend().index_for('Package').clear()
+ search.index_for('Package').clear()
def _pkg_names(self, result):
return ' '.join(result['results'])
@@ -41,91 +37,91 @@
return True
def test_1_all_records(self):
- result = self.backend.query_for(model.Package).run(query=self.q_all)
+ result = search.query_for(model.Package).run(query=self.q_all)
assert 'gils' in result['results'], result['results']
assert result['count'] == 6, result['count']
def test_1_name(self):
# exact name
- result = self.backend.query_for(model.Package).run(query=u'gils')
+ result = search.query_for(model.Package).run(query=u'gils')
assert result['count'] == 1, result
assert self._pkg_names(result) == 'gils', result
def test_1_name_multiple_results(self):
- result = self.backend.query_for(model.Package).run(query=u'gov')
+ result = search.query_for(model.Package).run(query=u'gov')
assert self._check_entity_names(result, ('us-gov-images', 'usa-courts-gov')), self._pkg_names(result)
assert result['count'] == 4, self._pkg_names(result)
def test_1_name_token(self):
- result = self.backend.query_for(model.Package).run(query=u'name:gils')
+ result = search.query_for(model.Package).run(query=u'name:gils')
assert self._pkg_names(result) == 'gils', self._pkg_names(result)
- result = self.backend.query_for(model.Package).run(query=u'title:gils')
+ result = search.query_for(model.Package).run(query=u'title:gils')
assert not self._check_entity_names(result, ('gils')), self._pkg_names(result)
def test_2_title(self):
# exact title, one word
- result = self.backend.query_for(model.Package).run(query=u'Opengov.se')
+ result = search.query_for(model.Package).run(query=u'Opengov.se')
assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result)
# multiple words
- result = self.backend.query_for(model.Package).run(query=u'Government Expenditure')
+ result = search.query_for(model.Package).run(query=u'Government Expenditure')
assert self._pkg_names(result) == 'uk-government-expenditure', self._pkg_names(result)
# multiple words wrong order
- result = self.backend.query_for(model.Package).run(query=u'Expenditure Government')
+ result = search.query_for(model.Package).run(query=u'Expenditure Government')
assert self._pkg_names(result) == 'uk-government-expenditure', self._pkg_names(result)
# multiple words, one doesn't match
- result = self.backend.query_for(model.Package).run(query=u'Expenditure Government China')
+ result = search.query_for(model.Package).run(query=u'Expenditure Government China')
assert len(result['results']) == 0, self._pkg_names(result)
def test_3_licence(self):
# this should result, but it is here to check that at least it does not error
- result = self.backend.query_for(model.Package).run(query=u'license:"OKD::Other (PublicsDomain)"')
+ result = search.query_for(model.Package).run(query=u'license:"OKD::Other (PublicsDomain)"')
assert result['count'] == 0, result
def test_quotation(self):
# multiple words quoted
- result = self.backend.query_for(model.Package).run(query=u'"Government Expenditure"')
+ result = search.query_for(model.Package).run(query=u'"Government Expenditure"')
assert self._pkg_names(result) == 'uk-government-expenditure', self._pkg_names(result)
# multiple words quoted wrong order
- result = self.backend.query_for(model.Package).run(query=u'"Expenditure Government"')
+ result = search.query_for(model.Package).run(query=u'"Expenditure Government"')
assert self._pkg_names(result) == '', self._pkg_names(result)
def test_string_not_found(self):
- result = self.backend.query_for(model.Package).run(query=u'randomthing')
+ result = search.query_for(model.Package).run(query=u'randomthing')
assert self._pkg_names(result) == '', self._pkg_names(result)
def test_tags_field(self):
- result = self.backend.query_for(model.Package).run(query=u'country-sweden')
+ result = search.query_for(model.Package).run(query=u'country-sweden')
assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
def test_tags_token_simple(self):
- result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden')
+ result = search.query_for(model.Package).run(query=u'tags:country-sweden')
assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
- result = self.backend.query_for(model.Package).run(query=u'tags:wildlife')
+ result = search.query_for(model.Package).run(query=u'tags:wildlife')
assert self._pkg_names(result) == 'us-gov-images', self._pkg_names(result)
def test_tags_token_simple_with_deleted_tag(self):
# registry has been deleted
- result = self.backend.query_for(model.Package).run(query=u'tags:registry')
+ result = search.query_for(model.Package).run(query=u'tags:registry')
assert self._pkg_names(result) == '', self._pkg_names(result)
def test_tags_token_multiple(self):
- result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden tags:format-pdf')
+ result = search.query_for(model.Package).run(query=u'tags:country-sweden tags:format-pdf')
assert self._pkg_names(result) == 'se-publications', self._pkg_names(result)
def test_tags_token_complicated(self):
- result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden tags:somethingrandom')
+ result = search.query_for(model.Package).run(query=u'tags:country-sweden tags:somethingrandom')
assert self._pkg_names(result) == '', self._pkg_names(result)
def test_pagination(self):
# large search
- all_results = self.backend.query_for(model.Package).run(query=self.q_all)
+ all_results = search.query_for(model.Package).run(query=self.q_all)
all_pkgs = all_results['results']
all_pkg_count = all_results['count']
# limit
options = search.QueryOptions()
options.limit = 2
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ result = search.query_for(model.Package).run(query=self.q_all, options=options)
pkgs = result['results']
count = result['count']
assert len(pkgs) == 2, pkgs
@@ -136,7 +132,7 @@
options = search.QueryOptions()
options.limit = 2
options.offset = 2
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ result = search.query_for(model.Package).run(query=self.q_all, options=options)
pkgs = result['results']
assert len(pkgs) == 2, pkgs
assert pkgs == all_pkgs[2:4]
@@ -145,14 +141,14 @@
options = search.QueryOptions()
options.limit = 2
options.offset = 4
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ result = search.query_for(model.Package).run(query=self.q_all, options=options)
pkgs = result['results']
assert len(pkgs) == 2, pkgs
assert pkgs == all_pkgs[4:6]
def test_order_by(self):
# large search
- all_results = self.backend.query_for(model.Package).run(query=self.q_all)
+ all_results = search.query_for(model.Package).run(query=self.q_all)
all_pkgs = all_results['results']
all_pkg_count = all_results['count']
@@ -160,7 +156,7 @@
# TODO: fix this test
# options = search.QueryOptions()
# options.order_by = 'rank'
- # result = self.backend.query_for(model.Package).run(query='penguin', options=options)
+ # result = search.query_for(model.Package).run(query='penguin', options=options)
# pkgs = result['results']
# fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
# assert fields[0] == 'usa-courts-gov', fields # has penguin three times
@@ -169,7 +165,7 @@
# name
options = search.QueryOptions()
options.order_by = 'name'
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ result = search.query_for(model.Package).run(query=self.q_all, options=options)
pkgs = result['results']
fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
sorted_fields = fields; sorted_fields.sort()
@@ -178,7 +174,7 @@
# title
options = search.QueryOptions()
options.order_by = 'title'
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ result = search.query_for(model.Package).run(query=self.q_all, options=options)
pkgs = result['results']
fields = [model.Package.by_name(pkg_name).title for pkg_name in pkgs]
sorted_fields = fields; sorted_fields.sort()
@@ -187,7 +183,7 @@
# notes
options = search.QueryOptions()
options.order_by = 'notes'
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ result = search.query_for(model.Package).run(query=self.q_all, options=options)
pkgs = result['results']
fields = [model.Package.by_name(pkg_name).notes for pkg_name in pkgs]
sorted_fields = fields; sorted_fields.sort()
@@ -196,7 +192,7 @@
# extra field
options = search.QueryOptions()
options.order_by = 'date_released'
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ result = search.query_for(model.Package).run(query=self.q_all, options=options)
pkgs = result['results']
fields = [model.Package.by_name(pkg_name) for pkg_name in pkgs]
fields = [field.extras.get('date_released') for field in fields]
@@ -204,45 +200,43 @@
assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
def test_search_notes_on(self):
- result = self.backend.query_for(model.Package).run(query=u'restrictions')
+ result = search.query_for(model.Package).run(query=u'restrictions')
pkgs = result['results']
count = result['count']
assert len(pkgs) == 2, pkgs
def test_search_foreign_chars(self):
- result = self.backend.query_for(model.Package).run(query='umlaut')
+ result = search.query_for(model.Package).run(query='umlaut')
assert result['results'] == ['gils'], result['results']
- result = self.backend.query_for(model.Package).run(query=u'thumb')
+ result = search.query_for(model.Package).run(query=u'thumb')
assert result['count'] == 0, result['results']
- result = self.backend.query_for(model.Package).run(query=u'th\xfcmb')
+ result = search.query_for(model.Package).run(query=u'th\xfcmb')
assert result['results'] == ['gils'], result['results']
def test_groups(self):
- result = self.backend.query_for(model.Package).run(query=u'groups:random')
+ result = search.query_for(model.Package).run(query=u'groups:random')
assert self._pkg_names(result) == '', self._pkg_names(result)
- result = self.backend.query_for(model.Package).run(query=u'groups:ukgov')
+ result = search.query_for(model.Package).run(query=u'groups:ukgov')
assert result['count'] == 4, self._pkg_names(result)
- result = self.backend.query_for(model.Package).run(query=u'groups:ukgov tags:us')
+ result = search.query_for(model.Package).run(query=u'groups:ukgov tags:us')
assert result['count'] == 2, self._pkg_names(result)
class TestSearchOverall(TestController):
@classmethod
def setup_class(cls):
CreateTestData.create()
- config['search_backend'] = 'solr'
search.rebuild()
- cls.backend = search.get_backend()
@classmethod
def teardown_class(cls):
model.repo.rebuild_db()
- search.get_backend().index_for('Package').clear()
+ search.index_for('Package').clear()
def _check_search_results(self, terms, expected_count, expected_packages=[], only_open=False, only_downloadable=False):
options = search.QueryOptions()
options.filter_by_openness = only_open
options.filter_by_downloadable = only_downloadable
- result = self.backend.query_for(model.Package).run(query=unicode(terms))
+ result = search.query_for(model.Package).run(query=unicode(terms))
pkgs = result['results']
count = result['count']
assert count == expected_count, (count, expected_count)
@@ -281,19 +275,17 @@
'extras':{'geographic_coverage':'000000:'},},
]
CreateTestData.create_arbitrary(init_data)
- config['search_backend'] = 'solr'
search.rebuild()
- cls.backend = search.get_backend()
@classmethod
def teardown_class(self):
model.repo.rebuild_db()
- search.get_backend().index_for('Package').clear()
+ search.index_for('Package').clear()
def _do_search(self, q, expected_pkgs, count=None):
options = search.QueryOptions()
options.order_by = 'rank'
- result = self.backend.query_for(model.Package).run(query=q, options=options)
+ result = search.query_for(model.Package).run(query=q, options=options)
pkgs = result['results']
fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
if not (count is None):
@@ -304,7 +296,7 @@
def _filtered_search(self, value, expected_pkgs, count=None):
options = search.QueryOptions()
options.order_by = 'rank'
- result = self.backend.query_for(model.Package).run(fields={'geographic_coverage':value}, options=options)
+ result = search.query_for(model.Package).run(fields={'geographic_coverage':value}, options=options)
pkgs = result['results']
fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
if not (count is None):
@@ -318,11 +310,14 @@
self._do_search(u'united kingdom', ['uk'], 1)
self._do_search(u'great britain', ['gb'], 1)
- # TODO: solr is not currently set up to allow partial matches
- # and extras are not saved as multivalued so this
- # test will fail. Make multivalued or remove?
- # def test_1_filtered(self):
- # self._filtered_search(u'england', ['eng', 'eng_ni', 'uk', 'gb'], 4)
+ def test_1_filtered(self):
+ # TODO: solr is not currently set up to allow partial matches
+ # and extras are not saved as multivalued so this
+ # test will fail. Make multivalued or remove?
+ from ckan.tests import SkipTest
+ raise SkipTest
+
+ self._filtered_search(u'england', ['eng', 'eng_ni', 'uk', 'gb'], 4)
class TestExtraFields(TestController):
@classmethod
@@ -340,17 +335,15 @@
'extras':{'department':''},},
]
CreateTestData.create_arbitrary(init_data)
- config['search_backend'] = 'solr'
search.rebuild()
- cls.backend = search.get_backend()
@classmethod
def teardown_class(self):
model.repo.rebuild_db()
- search.get_backend().index_for('Package').clear()
+ search.index_for('Package').clear()
def _do_search(self, department, expected_pkgs, count=None):
- result = self.backend.query_for(model.Package).run(fields={'department': department})
+ result = search.query_for(model.Package).run(fields={'department': department})
pkgs = result['results']
fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
if not (count is None):
@@ -361,12 +354,17 @@
def test_0_basic(self):
self._do_search(u'bcd', 'b', 1)
self._do_search(u'cde abc', 'c', 1)
+
+ def test_1_partial_matches(self):
# TODO: solr is not currently set up to allow partial matches
# and extras are not saved as multivalued so these
# tests will fail. Make multivalued or remove these?
- # self._do_search(u'abc', ['a', 'c'], 2)
- # self._do_search(u'cde', 'c', 1)
- # self._do_search(u'abc cde', 'c', 1)
+ from ckan.tests import SkipTest
+ raise SkipTest
+
+ self._do_search(u'abc', ['a', 'c'], 2)
+ self._do_search(u'cde', 'c', 1)
+ self._do_search(u'abc cde', 'c', 1)
class TestRank(TestController):
@classmethod
@@ -381,19 +379,17 @@
u'test1-penguin-canary',
u'test2-squirrel-squirrel-canary-goose'
]
- config['search_backend'] = 'solr'
search.rebuild()
- cls.backend = search.get_backend()
@classmethod
def teardown_class(self):
model.repo.rebuild_db()
- search.get_backend().index_for('Package').clear()
+ search.index_for('Package').clear()
def _do_search(self, q, wanted_results):
options = search.QueryOptions()
options.order_by = 'rank'
- result = self.backend.query_for(model.Package).run(query=q, options=options)
+ result = search.query_for(model.Package).run(query=q, options=options)
results = result['results']
err = 'Wanted %r, got %r' % (wanted_results, results)
assert wanted_results[0] == results[0], err
@@ -404,7 +400,10 @@
self._do_search(u'squirrel', self.pkg_names[::-1])
self._do_search(u'canary', self.pkg_names)
- # TODO: fix this test
- # def test_1_weighting(self):
- # self._do_search(u'penguin', self.pkg_names)
- # self._do_search(u'goose', self.pkg_names[::-1])
+ def test_1_weighting(self):
+ # TODO: fix this test
+ from ckan.tests import SkipTest
+ raise SkipTest
+
+ self._do_search(u'penguin', self.pkg_names)
+ self._do_search(u'goose', self.pkg_names[::-1])
--- a/ckan/tests/lib/test_solr_package_search_synchronous_update.py Thu Aug 18 13:42:11 2011 +0100
+++ b/ckan/tests/lib/test_solr_package_search_synchronous_update.py Thu Aug 18 13:42:59 2011 +0100
@@ -15,10 +15,8 @@
gc.collect()
CreateTestData.create()
- config['search_backend'] = 'solr'
search.rebuild()
plugins.load('synchronous_search')
- cls.backend = search.get_backend()
cls.new_pkg_dict = {
"name": "council-owned-litter-bins",
@@ -53,7 +51,7 @@
@classmethod
def teardown_class(cls):
model.repo.rebuild_db()
- search.get_backend().index_for('Package').clear()
+ search.index_for('Package').clear()
def _create_package(self, package=None):
rev = model.repo.new_revision()
http://bitbucket.org/okfn/ckan/changeset/e3b5137ef6b5/
changeset: e3b5137ef6b5
branch: feature-1275-solr-search
user: John Glover
date: 2011-08-18 15:02:12
summary: [solr] Remove postgres search
affected #: 5 files (60 bytes)
--- a/ckan/lib/search/sql.py Thu Aug 18 13:42:59 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,304 +0,0 @@
-import logging
-
-import sqlalchemy
-from sqlalchemy.sql import or_
-from sqlalchemy.exceptions import UnboundExecutionError
-
-from common import SearchBackend, SearchQuery, SearchError
-from common import SearchIndex, NoopSearchIndex
-from ckan import model
-from ckan.model import meta
-from ckan import authz
-
-log = logging.getLogger(__name__)
-
-
-class SqlSearchBackend(SearchBackend):
-
- @property
- def connection(self):
- return meta.Session.connection()
-
- def _setup(self):
- self.register(model.Package, PackageSqlSearchIndex, PackageSqlSearchQuery)
- self.register(model.Group, NoopSearchIndex, GroupSqlSearchQuery)
- self.register(model.Tag, NoopSearchIndex, TagSqlSearchQuery)
- self.register(model.Resource, NoopSearchIndex, ResourceSqlSearchQuery)
-
-
-class SqlSearchQuery(SearchQuery):
- """ Common functions for queries against the DB. """
-
- def _db_query(self, q):
- # Run the query
- self.count = q.count()
- q = q.offset(self.options.get('offset'))
- q = q.limit(self.options.get('limit'))
-
- #print q
-
- self.results = []
- for result in q:
- if isinstance(result, tuple) and isinstance(result[0], model.DomainObject):
- # This is the case for order_by rank due to the add_column.
- self.results.append(result[0])
- else:
- self.results.append(result)
-
-
-class GroupSqlSearchQuery(SqlSearchQuery):
- """ Search for groups in plain SQL. """
-
- def _run(self):
- if not self.query.terms:
- return
- q = authz.Authorizer().authorized_query(username, model.Group)
- for term in self.query.terms:
- q = query.filter(model.Group.name.contains(term.lower()))
- self._db_query(q)
-
-
-class TagSqlSearchQuery(SqlSearchQuery):
- """ Search for tags in plain SQL. """
-
- def _run(self):
- q = model.Session.query(model.Tag)
- q = q.distinct().join(model.Tag.package_tags)
- terms = list(self.query.terms)
- for field, value in self.query.fields.items():
- if field in ('tag', 'tags'):
- terms.append(value)
- if not len(terms):
- return
- for term in terms:
- q = q.filter(model.Tag.name.contains(term.lower()))
- self._db_query(q)
-
-
-class ResourceSqlSearchQuery(SqlSearchQuery):
- """ Search for resources in plain SQL. """
-
- def _run(self):
- q = model.Session.query(model.Resource) # TODO authz
- if self.query.terms:
- raise SearchError('Only field specific terms allowed in resource search.')
- #self._check_options_specified_are_allowed('resource search', ['all_fields', 'offset', 'limit'])
- self.options.ref_entity_with_attr = 'id' # has no name
- resource_fields = model.Resource.get_columns()
- for field, terms in self.query.fields.items():
- if isinstance(terms, basestring):
- terms = terms.split()
- if field not in resource_fields:
- raise SearchError('Field "%s" not recognised in Resource search.' % field)
- for term in terms:
- model_attr = getattr(model.Resource, field)
- if field == 'hash':
- q = q.filter(model_attr.ilike(unicode(term) + '%'))
- elif field in model.Resource.get_extra_columns():
- model_attr = getattr(model.Resource, 'extras')
-
- like = or_(model_attr.ilike(u'''%%"%s": "%%%s%%",%%''' % (field, term)),
- model_attr.ilike(u'''%%"%s": "%%%s%%"}''' % (field, term))
- )
- q = q.filter(like)
- else:
- q = q.filter(model_attr.ilike('%' + unicode(term) + '%'))
-
- order_by = self.options.order_by
- if order_by is not None:
- if hasattr(model.Resource, order_by):
- q = q.order_by(getattr(model.Resource, order_by))
- self._db_query(q)
-
-
-class PackageSqlSearchQuery(SqlSearchQuery):
- """ Search for packages using SQL and Postgres' TS full-text search. """
-
- def _run(self):
- q = authz.Authorizer().authorized_query(self.options.get('username'), model.Package)
- make_like = lambda x,y: x.ilike(u'%' + unicode(y) + u'%')
- q = q.filter(model.package_search_table.c.package_id==model.Package.id)
-
- all_terms = ''
- if self.query.query != '*:*':
- # Full search by general terms (and field specific terms but not by field)
- terms_set = set(self.query.terms)
- terms_set.update(self.query.fields.values())
- all_terms = u' '.join(map(unicode, terms_set))
-
- if len(all_terms.strip()):
- q = q.filter(u'package_search.search_vector @@ plainto_tsquery(:terms)')
- q = q.params(terms=all_terms)
-
- # Filter by field specific terms
- for field, terms in self.query.fields.items():
- if field == 'tags':
- q = self._filter_by_tag(q, terms)
- continue
- elif field == 'groups':
- q = self._filter_by_group(q, terms)
- continue
-
- if isinstance(terms, basestring):
- terms = terms.split()
-
- if field in model.package_table.c:
- model_attr = getattr(model.Package, field)
- for term in terms:
- q = q.filter(make_like(model_attr, term))
- else:
- q = self._filter_by_extra(q, field, terms)
-
- # Filter for options
- if self.options.filter_by_downloadable:
- q = q.join('resource_groups_all', 'resources_all', aliased=True)
- q = q.filter(sqlalchemy.and_(
- model.Resource.state==model.State.ACTIVE,
- model.ResourceGroup.package_id==model.Package.id))
- if self.options.filter_by_openness:
- q = q.filter(model.Package.license_id.in_(self.open_licenses))
-
- order_by = self.options.order_by
- if order_by is not None:
- if order_by == 'rank':
- q = q.add_column(sqlalchemy.func.ts_rank_cd(sqlalchemy.text('package_search.search_vector'),
- sqlalchemy.func.plainto_tsquery(all_terms)))
- q = q.order_by(sqlalchemy.text('ts_rank_cd_1 DESC'))
- elif hasattr(model.Package, order_by):
- q = q.order_by(getattr(model.Package, order_by))
- else:
- # TODO extras
- raise NotImplemented
-
- q = q.distinct()
- self._db_query(q)
-
- def _filter_by_tag(self, q, term):
- if not self.options.search_tags:
- return q
- tag = model.Tag.by_name(unicode(term), autoflush=False)
- if tag:
- # need to keep joining for each filter
- # tag should be active hence state_id requirement
- q = q.join('package_tags', aliased=True).filter(sqlalchemy.and_(
- model.PackageTag.state==model.State.ACTIVE,
- model.PackageTag.tag_id==tag.id))
- else:
- # unknown tag, so torpedo search
- q = q.filter(model.PackageTag.tag_id==u'\x130')
- return q
-
- def _filter_by_group(self, q, term):
- group = model.Group.by_name(unicode(term), autoflush=False)
- if group:
- # need to keep joining for each filter
- q = q.join('package_group_all', 'group', aliased=True).filter(
- model.Group.id==group.id)
- else:
- # unknown group, so torpedo search
- q = q.filter(model.Group.id==u'-1')
- return q
-
- def _filter_by_extra(self, q, field, terms):
- make_like = lambda x,y: x.ilike(u'%' + unicode(y) + u'%')
- for term in terms:
- q = q.join('_extras', aliased=True)
- q = q.filter(model.PackageExtra.state==model.State.ACTIVE)
- q = q.filter(model.PackageExtra.key==unicode(field))
- q = q.filter(make_like(model.PackageExtra.value, term))
- return q
-
-
-class SqlSearchIndex(SearchIndex): pass
-
-
-class PackageSqlSearchIndex(SqlSearchIndex):
-
- def _make_vector(self, pkg_dict):
- if isinstance(pkg_dict.get('tags'), (list, tuple)):
- pkg_dict['tags'] = ' '.join(pkg_dict.get('tags', []))
- if isinstance(pkg_dict.get('groups'), (list, tuple)):
- pkg_dict['groups'] = ' '.join(pkg_dict.get('groups', []))
-
- document_a = u' '.join((pkg_dict.get('name') or u'', pkg_dict.get('title') or u''))
- document_b_items = []
- for field_name in ['notes', 'tags', 'groups', 'author', 'maintainer', 'url']:
- val = pkg_dict.get(field_name)
- if val:
- document_b_items.append(val)
- extras = pkg_dict.get('extras', {})
- for key, value in extras.items():
- if value is not None:
- document_b_items.append(unicode(value))
- document_b = u' '.join(document_b_items)
-
- # Create weighted vector
- vector_sql = 'setweight(to_tsvector(%s), \'A\') || setweight(to_tsvector(%s), \'D\')'
- params = [document_a.encode('utf8'), document_b.encode('utf8')]
- return vector_sql, params
-
- def _print_lexemes(self, pkg_dict):
- sql = "SELECT package_id, search_vector FROM package_search WHERE package_id = %s"
- res = self.backend.connection.execute(sql, pkg_dict['id'])
- print res.fetchall()
- res.close()
-
- def _run_sql(self, sql, params):
- conn = self.backend.connection
- tx = conn.begin_nested()
- try:
- res = conn.execute(sql, params)
- results = res.fetchall() if not res.closed else None
- res.close()
- tx.commit()
- except Exception, e:
- tx.rollback()
- raise
- return results
-
- def insert_dict(self, pkg_dict):
- if not 'id' in pkg_dict or not 'name' in pkg_dict:
- return
- vector_sql, params = self._make_vector(pkg_dict)
- sql = "INSERT INTO package_search VALUES (%%s, %s)" % vector_sql
- params = [pkg_dict.get('id')] + params
- self._run_sql(sql, params)
- log.debug("Indexed %s" % pkg_dict.get('name'))
-
- def update_dict(self, pkg_dict):
- if not 'id' in pkg_dict or not 'name' in pkg_dict:
- return
- vector_sql, params = self._make_vector(pkg_dict)
- sql = "UPDATE package_search SET search_vector=%s WHERE package_id=%%s" % vector_sql
- params.append(pkg_dict['id'])
- self._run_sql(sql, params)
- log.debug("Updated index for %s" % pkg_dict.get('name'))
-
- def remove_dict(self, pkg_dict):
- if not 'id' in pkg_dict or not 'name' in pkg_dict:
- return
- sql = "DELETE FROM package_search WHERE package_id=%s"
- self._run_sql(sql, [pkg_dict.get('id')])
- log.debug("Delete entry %s from index" % pkg_dict.get('id'))
-
-
- # This is currently handled by the foreign key constraint on package_id.
- # Once we remove that constraint, manual removal will become necessary.
- pass
-
- def clear(self):
- self._run_sql("DELETE FROM package_search WHERE 1=1", {})
-
- def get_all_entity_ids(self):
- sql = 'SELECT package_id FROM package_search'
- results = self._run_sql(sql, [])
- return [res[0] for res in results]
-
- def get_index(self, pkg_ref):
- pkg = model.Package.get(pkg_ref)
- assert pkg
- sql = "SELECT package_id, search_vector FROM package_search WHERE package_id = %s"
- res = self.backend.connection.execute(sql, pkg.id)
- search_vector = res.fetchall()
- res.close()
- return search_vector
--- a/ckan/tests/lib/test_package_search.py Thu Aug 18 13:42:59 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,451 +0,0 @@
-import time
-
-from ckan.model import Package
-import ckan.lib.search as search
-from ckan.lib.search import get_backend, query_for, QueryOptions
-import ckan.model as model
-from ckan.tests import *
-from ckan.tests import is_search_supported
-from ckan.lib.create_test_data import CreateTestData
-
-class TestSearch(TestController):
- q_all = u'penguin'
-
- @classmethod
- def setup_class(self):
- if not is_search_supported():
- raise SkipTest("Search not supported")
-
- indexer = TestSearchIndexer()
- model.Session.remove()
- CreateTestData.create_search_test_data()
-
- # now remove a tag so we can test search with deleted tags
- model.repo.new_revision()
- gils = model.Package.by_name(u'gils')
- # an existing tag used only by gils
- self.tagname = u'registry'
- # we aren't guaranteed it is last ...
- idx = [ t.name for t in gils.tags].index(self.tagname)
- del gils.tags[idx]
- model.repo.commit_and_remove()
- indexer.index()
-
- self.gils = model.Package.by_name(u'gils')
- self.war = model.Package.by_name(u'warandpeace')
- self.russian = model.Tag.by_name(u'russian')
- self.tolstoy = model.Tag.by_name(u'tolstoy')
-
- self.backend = get_backend(backend='sql')
-
- @classmethod
- def teardown_class(self):
- model.repo.rebuild_db()
-
- def _pkg_names(self, result):
- return ' '.join(result['results'])
-
- def _check_entity_names(self, result, names_in_result):
- names = result['results']
- for name in names_in_result:
- if name not in names:
- return False
- return True
-
- # Can't search for all records in postgres, so search for 'penguin' which
- # we have put in all the records.
- def test_1_all_records(self):
- # all records
- result = self.backend.query_for(model.Package).run(query=self.q_all)
- assert 'gils' in result['results'], result['results']
- assert result['count'] > 5, result['count']
-
- def test_1_name(self):
- # exact name
- result = self.backend.query_for(model.Package).run(query=u'gils')
- assert self._pkg_names(result) == 'gils', result
- assert result['count'] == 1, result
-
- def test_1_name_multiple_results(self):
- result = self.backend.query_for(model.Package).run(query=u'gov')
- assert self._check_entity_names(result, ('us-gov-images', 'usa-courts-gov')), self._pkg_names(result)
- assert result['count'] == 4, self._pkg_names(result)
-
- def test_1_name_token(self):
- result = self.backend.query_for(model.Package).run(query=u'name:gils')
- assert self._pkg_names(result) == 'gils', self._pkg_names(result)
-
- result = self.backend.query_for(model.Package).run(query=u'title:gils')
- assert not self._check_entity_names(result, ('gils')), self._pkg_names(result)
-
- def test_2_title(self):
- # exact title, one word
- result = self.backend.query_for(model.Package).run(query=u'Opengov.se')
- assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result)
-
-## # part word
-## result = Search().search(u'gov.se')
-## assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result)
-
- # multiple words
- result = self.backend.query_for(model.Package).run(query=u'Government Expenditure')
- assert self._pkg_names(result) == 'uk-government-expenditure', self._pkg_names(result)
-
- # multiple words wrong order
- result = self.backend.query_for(model.Package).run(query=u'Expenditure Government')
- assert self._pkg_names(result) == 'uk-government-expenditure', self._pkg_names(result)
-
- # multiple words, one doesn't match
- result = self.backend.query_for(model.Package).run(query=u'Expenditure Government China')
- assert len(result['results']) == 0, self._pkg_names(result)
-
- def test_3_licence(self):
- ## this should result, but it is here to check that at least it does not error
- result = self.backend.query_for(model.Package).run(query=u'license:"OKD::Other (PublicsDomain)"')
- assert result['count'] == 0, result
-
-# Quotation not supported now
-## # multiple words quoted
-## result = Search().search(u'"Government Expenditure"')
-## assert self._pkg_names(result) == 'uk-government-expenditure', self._pkg_names(result)
-
-## # multiple words quoted wrong order
-## result = Search().search(u'Expenditure Government')
-## assert self._pkg_names(result) == '', self._pkg_names(result)
-
- # token
- result = self.backend.query_for(model.Package).run(query=u'title:Opengov.se')
- assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result)
-
- # token
- result = self.backend.query_for(model.Package).run(query=u'name:gils')
- assert self._pkg_names(result) == 'gils', self._pkg_names(result)
-
- # token
- result = self.backend.query_for(model.Package).run(query=u'randomthing')
- assert self._pkg_names(result) == '', self._pkg_names(result)
-
- def test_tags_field(self):
- result = self.backend.query_for(model.Package).run(query=u'country-sweden')
- assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
-
- def test_tags_token_simple(self):
- result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden')
- assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
-
- result = self.backend.query_for(model.Package).run(query=u'tags:wildlife')
- assert self._pkg_names(result) == 'us-gov-images', self._pkg_names(result)
-
- def test_tags_token_simple_with_deleted_tag(self):
- # registry has been deleted
- result = self.backend.query_for(model.Package).run(query=u'tags:registry')
- assert self._pkg_names(result) == '', self._pkg_names(result)
-
- def test_tags_token_multiple(self):
- result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden tags:format-pdf')
- assert self._pkg_names(result) == 'se-publications', self._pkg_names(result)
-
- def test_tags_token_complicated(self):
- result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden tags:somethingrandom')
- assert self._pkg_names(result) == '', self._pkg_names(result)
-
- def test_tag_basic(self):
- result = self.backend.query_for('tag').run(query=u'gov')
- assert result['count'] == 2, result
- assert self._check_entity_names(result, ('gov', 'government')), self._pkg_names(result)
-
- def test_tag_basic_2(self):
- result = self.backend.query_for('tag').run(query=u'wildlife')
- assert self._pkg_names(result) == 'wildlife', self._pkg_names(result)
-
- def test_tag_with_tags_option(self):
- result = self.backend.query_for('tag').run(query=u'tags:wildlife')
- assert self._pkg_names(result) == 'wildlife', self._pkg_names(result)
-
- def test_tag_with_blank_tags(self):
- result = self.backend.query_for('tag').run(query=u'tags: wildlife')
- assert self._pkg_names(result) == 'wildlife', self._pkg_names(result)
-
- def test_pagination(self):
- # large search
- all_results = self.backend.query_for(model.Package).run(query=self.q_all)
- all_pkgs = all_results['results']
- all_pkg_count = all_results['count']
-
- # limit
- options = QueryOptions()
- options.limit = 2
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
- pkgs = result['results']
- count = result['count']
- assert len(pkgs) == 2, pkgs
- assert count == all_pkg_count
- assert pkgs == all_pkgs[:2]
-
- # offset
- options = QueryOptions()
- options.limit = 2
- options.offset = 2
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
- pkgs = result['results']
- assert len(pkgs) == 2, pkgs
- assert pkgs == all_pkgs[2:4]
-
- # larger offset
- options = QueryOptions()
- options.limit = 2
- options.offset = 4
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
- pkgs = result['results']
- assert len(pkgs) == 2, pkgs
- assert pkgs == all_pkgs[4:6]
-
- def test_order_by(self):
- # large search
- all_results = self.backend.query_for(model.Package).run(query=self.q_all)
- all_pkgs = all_results['results']
- all_pkg_count = all_results['count']
-
- # rank
- options = QueryOptions()
- options.order_by = 'rank'
- result = self.backend.query_for(model.Package).run(query='penguin', options=options)
- pkgs = result['results']
- fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
- assert fields[0] == 'usa-courts-gov', fields # has penguin three times
- assert pkgs == all_pkgs, pkgs #default ordering
-
- # name
- options = QueryOptions()
- options.order_by = 'name'
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
- pkgs = result['results']
- fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
- sorted_fields = fields; sorted_fields.sort()
- assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
-
- # title
- options = QueryOptions()
- options.order_by = 'title'
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
- pkgs = result['results']
- fields = [model.Package.by_name(pkg_name).title for pkg_name in pkgs]
- sorted_fields = fields; sorted_fields.sort()
- assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
-
- # notes
- options = QueryOptions()
- options.order_by = 'notes'
- result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
- pkgs = result['results']
- fields = [model.Package.by_name(pkg_name).notes for pkg_name in pkgs]
- sorted_fields = fields; sorted_fields.sort()
- assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
-
- # extra field
-## TODO: Get this working
-## options = SearchOptions({'q':self.q_all})
-## options.order_by = 'date_released'
-## result = Search().run(options)
-## pkgs = result['results']
-## fields = [model.Package.by_name(pkg_name).extras.get('date_released') for pkg_name in pkgs]
-## sorted_fields = fields; sorted_fields.sort()
-## assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
-
- def test_search_notes_on(self):
- result = self.backend.query_for(model.Package).run(query=u'restrictions')
- pkgs = result['results']
- count = result['count']
- assert len(pkgs) == 2, pkgs
-
- def test_search_foreign_chars(self):
- result = self.backend.query_for(model.Package).run(query='umlaut')
- assert result['results'] == ['gils'], result['results']
- result = self.backend.query_for(model.Package).run(query=u'thumb')
- assert result['count'] == 0, result['results']
- result = self.backend.query_for(model.Package).run(query=u'th\xfcmb')
- assert result['results'] == ['gils'], result['results']
-
- # Groups searching deprecated for now
- def _test_groups(self):
- result = self.backend.query_for(model.Package).run(query=u'groups:random')
- assert self._pkg_names(result) == '', self._pkg_names(result)
-
- result = self.backend.query_for(model.Package).run(query=u'groups:ukgov')
- assert result['count'] == 4, self._pkg_names(result)
-
- result = self.backend.query_for(model.Package).run(query=u'groups:ukgov tags:us')
- assert result['count'] == 2, self._pkg_names(result)
-
-class TestSearchOverall(TestController):
- @classmethod
- def setup_class(self):
- indexer = TestSearchIndexer()
- CreateTestData.create()
- indexer.index()
- self.backend = get_backend(backend='sql')
-
- @classmethod
- def teardown_class(self):
- model.repo.rebuild_db()
-
- def _check_search_results(self, terms, expected_count, expected_packages=[], only_open=False, only_downloadable=False):
- options = QueryOptions()
- options.filter_by_openness = only_open
- options.filter_by_downloadable = only_downloadable
- result = self.backend.query_for(model.Package).run(query=unicode(terms))
- pkgs = result['results']
- count = result['count']
- assert count == expected_count, (count, expected_count)
- for expected_pkg in expected_packages:
- assert expected_pkg in pkgs, '%s : %s' % (expected_pkg, result)
-
- def test_overall(self):
- self._check_search_results('annakarenina', 1, ['annakarenina'] )
- self._check_search_results('warandpeace', 1, ['warandpeace'] )
- #self._check_search_results('', 0 )
- self._check_search_results('A Novel By Tolstoy', 1, ['annakarenina'] )
- self._check_search_results('title:Novel', 1, ['annakarenina'] )
- self._check_search_results('title:peace', 0 )
- self._check_search_results('name:warandpeace', 1 )
- self._check_search_results('groups:david', 2 )
- self._check_search_results('groups:roger', 1 )
- self._check_search_results('groups:lenny', 0 )
- self._check_search_results('annakarenina', 1, ['annakarenina'], True, False )
- self._check_search_results('annakarenina', 1, ['annakarenina'], False, True )
- self._check_search_results('annakarenina', 1, ['annakarenina'], True, True )
-
-
-class TestGeographicCoverage(TestController):
- @classmethod
- def setup_class(self):
- indexer = TestSearchIndexer()
- init_data = [
- {'name':'eng',
- 'extras':{'geographic_coverage':'100000: England'},},
- {'name':'eng_ni',
- 'extras':{'geographic_coverage':'100100: England, Northern Ireland'},},
- {'name':'uk',
- 'extras':{'geographic_coverage':'111100: United Kingdom (England, Scotland, Wales, Northern Ireland'},},
- {'name':'gb',
- 'extras':{'geographic_coverage':'111000: Great Britain (England, Scotland, Wales)'},},
- {'name':'none',
- 'extras':{'geographic_coverage':'000000:'},},
- ]
- CreateTestData.create_arbitrary(init_data)
- indexer.index()
- self.backend = get_backend(backend='sql')
-
-
- @classmethod
- def teardown_class(self):
- model.repo.rebuild_db()
-
- def _do_search(self, q, expected_pkgs, count=None):
- options = QueryOptions()
- options.order_by = 'rank'
- result = self.backend.query_for(model.Package).run(query=q, options=options)
- pkgs = result['results']
- fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
- if not (count is None):
- assert result['count'] == count, result['count']
- for expected_pkg in expected_pkgs:
- assert expected_pkg in fields, expected_pkg
-
- def _filtered_search(self, value, expected_pkgs, count=None):
- options = QueryOptions()
- options.order_by = 'rank'
- result = self.backend.query_for(model.Package).run(fields={'geographic_coverage':value}, options=options)
- pkgs = result['results']
- fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
- if not (count is None):
- assert result['count'] == count, result['count']
- for expected_pkg in expected_pkgs:
- assert expected_pkg in fields, expected_pkg
-
- def test_0_basic(self):
- self._do_search(u'england', ['eng', 'eng_ni', 'uk', 'gb'], 4)
- self._do_search(u'northern ireland', ['eng_ni', 'uk'], 2)
- self._do_search(u'united kingdom', ['uk'], 1)
- self._do_search(u'great britain', ['gb'], 1)
-
- def test_1_filtered(self):
- self._filtered_search(u'england', ['eng', 'eng_ni', 'uk', 'gb'], 4)
-
-class TestExtraFields(TestController):
- @classmethod
- def setup_class(self):
- indexer = TestSearchIndexer()
- init_data = [
- {'name':'a',
- 'extras':{'department':'abc',
- 'agency':'ag-a'},},
- {'name':'b',
- 'extras':{'department':'bcd',
- 'agency':'ag-b'},},
- {'name':'c',
- 'extras':{'department':'cde abc'},},
- {'name':'none',
- 'extras':{'department':''},},
- ]
- CreateTestData.create_arbitrary(init_data)
- indexer.index()
- self.backend = get_backend(backend='sql')
-
- @classmethod
- def teardown_class(self):
- model.repo.rebuild_db()
-
- def _do_search(self, department, expected_pkgs, count=None):
- result = self.backend.query_for(model.Package).run(fields={'department':department})
- pkgs = result['results']
- fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
- if not (count is None):
- assert result['count'] == count, result['count']
- for expected_pkg in expected_pkgs:
- assert expected_pkg in fields, expected_pkg
-
- def test_0_basic(self):
- self._do_search(u'bcd', 'b', 1)
- self._do_search(u'abc', ['a', 'c'], 2)
- self._do_search(u'cde', 'c', 1)
- self._do_search(u'abc cde', 'c', 1)
- self._do_search(u'cde abc', 'c', 1)
-
-class TestRank(TestController):
- @classmethod
- def setup_class(self):
- indexer = TestSearchIndexer()
- init_data = [{'name':u'test1-penguin-canary',
- 'tags':u'canary goose squirrel wombat wombat'},
- {'name':u'test2-squirrel-squirrel-canary-goose',
- 'tags':u'penguin wombat'},
- ]
- CreateTestData.create_arbitrary(init_data)
- self.pkg_names = [u'test1-penguin-canary',
- u'test2-squirrel-squirrel-canary-goose']
- indexer.index()
- self.backend = get_backend(backend='sql')
-
- @classmethod
- def teardown_class(self):
- model.repo.rebuild_db()
-
- def _do_search(self, q, wanted_results):
- options = QueryOptions()
- options.order_by = 'rank'
- result = self.backend.query_for(model.Package).run(query=q, options=options)
- results = result['results']
- err = 'Wanted %r, got %r' % (wanted_results, results)
- assert wanted_results[0] == results[0], err
- assert wanted_results[1] == results[1], err
-
- def test_0_basic(self):
- self._do_search(u'wombat', self.pkg_names)
- self._do_search(u'squirrel', self.pkg_names[::-1])
- self._do_search(u'canary', self.pkg_names)
-
- def test_1_weighting(self):
- self._do_search(u'penguin', self.pkg_names)
- self._do_search(u'goose', self.pkg_names[::-1])
-
--- a/ckan/tests/lib/test_package_search_synchronous_update.py Thu Aug 18 13:42:59 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,137 +0,0 @@
-import json
-
-from ckan.tests import *
-from ckan.tests import is_search_supported
-import ckan.lib.search as search
-from ckan import plugins
-from test_package_search import TestSearchOverall
-from ckan import model
-
-class TestSearchOverallWithSynchronousIndexing(TestSearchOverall):
- '''Repeat test from test_package_search with synchronous indexing
- '''
-
- @classmethod
- def setup_class(self):
- if not is_search_supported():
- raise SkipTest("Search not supported")
-
- import gc
- from pylons import config
-
- # Force a garbage collection to trigger issue #695
- gc.collect()
-
- config['search_backend'] = 'sql'
- self.backend = search.get_backend()
- plugins.load('synchronous_search')
- CreateTestData.create()
-
- def test_01_search_table_count(self):
-
- assert model.Session.query(model.PackageSearch).count() == 2
-
- def test_02_add_package_from_dict(self):
-
- print self.create_package_from_data.__doc__
- self.package = self.create_package_from_data(json.loads(str(self.create_package_from_data.__doc__)))
-
- assert model.Session.query(model.PackageSearch).count() == 3
-
- self._check_search_results('wee', 1, ['council-owned-litter-bins'])
-
- def test_03_update_package_from_dict(self):
-
- package = model.Package.by_name('council-owned-litter-bins')
-
-
- update_dict = json.loads(str(self.create_package_from_data.__doc__))
- update_dict['name'] = 'new_name'
- update_dict['extras']['published_by'] = 'meeeee'
-
- self.create_package_from_data(update_dict, package)
- assert model.Session.query(model.PackageSearch).count() == 3
-
- self._check_search_results('meeeee', 1, ['new_name'])
-
- def test_04_delete_package_from_dict(self):
-
- package = model.Package.by_name('new_name')
-
- model.Session.delete(package)
- model.Session.commit()
-
- assert model.Session.query(model.PackageSearch).count() == 2
-
- def create_package_from_data(self, package_data, package = None):
- ''' {"extras": {"INSPIRE": "True",
- "bbox-east-long": "-3.12442",
- "bbox-north-lat": "54.218407",
- "bbox-south-lat": "54.039634",
- "bbox-west-long": "-3.32485",
- "constraint": "conditions unknown; (e) intellectual property rights;",
- "dataset-reference-date": [{"type": "creation",
- "value": "2008-10-10"},
- {"type": "revision",
- "value": "2009-10-08"}],
- "guid": "00a743bf-cca4-4c19-a8e5-e64f7edbcadd",
- "metadata-date": "2009-10-16",
- "metadata-language": "eng",
- "published_by": 0,
- "resource-type": "dataset",
- "spatial-reference-system": "wee",
- "temporal_coverage-from": "1977-03-10T11:45:30",
- "temporal_coverage-to": "2005-01-15T09:10:00"},
- "name": "council-owned-litter-bins",
- "notes": "Location of Council owned litter bins within Borough.",
- "resources": [{"description": "Resource locator",
- "format": "Unverified",
- "url": "http://www.barrowbc.gov.uk"}],
- "tags": ["Utility and governmental services"],
- "title": "Council Owned Litter Bins"}
- '''
-
- if not package:
- package = model.Package()
-
- rev = model.repo.new_revision()
-
- relationship_attr = ['extras', 'resources', 'tags']
-
- package_properties = {}
- for key, value in package_data.iteritems():
- if key not in relationship_attr:
- setattr(package, key, value)
-
- tags = package_data.get('tags', [])
-
- for tag in tags:
- package.add_tag_by_name(tag, autoflush=False)
-
- for resource_dict in package_data.get("resources", []):
- resource = model.Resource(**resource_dict)
- package.resources[:] = []
- package.resources.append(resource)
-
- for key, value in package_data.get("extras", {}).iteritems():
- extra = model.PackageExtra(key=key, value=value)
- package._extras[key] = extra
-
- model.Session.add(package)
- model.Session.flush()
-
- model.setup_default_user_roles(package, [])
-
-
- model.Session.add(rev)
- model.Session.commit()
-
- return package
-
-
- @classmethod
- def teardown_class(self):
- model.repo.rebuild_db()
-
-# Stop parent class tests from running
-#TestSearchOverall = None
--- a/ckan/tests/lib/test_resource_search.py Thu Aug 18 13:42:59 2011 +0100
+++ b/ckan/tests/lib/test_resource_search.py Thu Aug 18 14:02:12 2011 +0100
@@ -3,7 +3,7 @@
from ckan.tests import *
from ckan.tests import is_search_supported
-from ckan.lib.search import get_backend, QueryOptions
+from ckan.lib.search import QueryOptions
from ckan import model
from ckan.lib.create_test_data import CreateTestData
from ckan.lib.search.common import SearchError
@@ -11,6 +11,8 @@
class TestSearch(object):
@classmethod
def setup_class(self):
+ raise SkipTest("Resource search not yet implemented with solr")
+
if not is_search_supported():
raise SkipTest("Search not supported")
--- a/ckan/tests/lib/test_search_index.py Thu Aug 18 13:42:59 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,87 +0,0 @@
-import time
-
-import sqlalchemy as sa
-
-from ckan.tests import *
-from ckan.tests import is_search_supported
-from ckan import model
-import ckan.lib.search as search
-
-class TestSearchIndex(TestController):
- '''Tests that a package is indexed when the packagenotification is
- received by the indexer.'''
- worker = None
-
- @classmethod
- def setup_class(cls):
- if not is_search_supported():
- raise SkipTest("Search not supported")
- CreateTestData.create()
-
- @classmethod
- def teardown_class(cls):
- model.repo.rebuild_db()
-
- def test_index(self):
- search.dispatch_by_operation('Package', {'title': 'penguin'}, 'new',
- backend=search.get_backend(backend='sql'))
-
- sql = "select search_vector from package_search where package_id='%s'" % self.anna.id
- vector = model.Session.execute(sql).fetchone()[0]
- assert 'annakarenina' in vector, vector
- assert not 'penguin' in vector, vector
-
-
-class PostgresSearch(object):
- '''Demo of how postgres search works.'''
- def filter_by(self, query, terms):
- q = query
- q = q.filter(model.package_search_table.c.package_id==model.Package.id)
- q = q.filter('package_search.search_vector '\
- '@@ plainto_tsquery(:terms)')
- q = q.params(terms=terms)
- q = q.add_column(sa.func.ts_rank_cd('package_search.search_vector', sa.func.plainto_tsquery(terms)))
- return q
-
- def order_by(self, query):
- return query.order_by('ts_rank_cd_1')
-
- def search(self, terms):
- import ckan.model as model
- q = self.filter_by(model.Session.query(model.Package), terms)
- q = self.order_by(q)
- q = q.distinct()
- results = [pkg_tuple[0].name for pkg_tuple in q.all()]
- return {'results':results, 'count':q.count()}
-
-
-def allow_time_to_create_search_index():
- time.sleep(0.5)
-
-class TestPostgresSearch:
- @classmethod
- def setup_class(self):
- tsi = TestSearchIndexer()
- CreateTestData.create_search_test_data()
- tsi.index()
-
- self.gils = model.Package.by_name(u'gils')
- self.war = model.Package.by_name(u'warandpeace')
- self.russian = model.Tag.by_name(u'russian')
- self.tolstoy = model.Tag.by_name(u'tolstoy')
-
- @classmethod
- def teardown_class(self):
- model.repo.rebuild_db()
-
- def test_0_indexing(self):
- searches = model.metadata.bind.execute('SELECT package_id, search_vector FROM package_search').fetchall()
- assert searches[0][1], searches
- q = model.Session.query(model.Package).filter(model.package_search_table.c.package_id==model.Package.id)
- assert q.count() == 6, q.count()
-
- def test_1_basic(self):
- result = PostgresSearch().search(u'sweden')
- assert 'se-publications' in result['results'], result['results']
- assert result['count'] == 2, result['count']
-
Repository URL: https://bitbucket.org/okfn/ckan/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
More information about the ckan-changes
mailing list