[ckan-changes] commit/ckanext-solr: 2 new changesets
Bitbucket
commits-noreply at bitbucket.org
Thu Aug 11 15:54:40 UTC 2011
2 new changesets in ckanext-solr:
http://bitbucket.org/okfn/ckanext-solr/changeset/27d20d7a0d16/
changeset: 27d20d7a0d16
user: John Glover
date: 2011-08-11 17:53:43
summary: add name field to default search (text)
affected #: 1 file (142 bytes)
--- a/schema.xml Thu Aug 11 14:10:52 2011 +0100
+++ b/schema.xml Thu Aug 11 16:53:43 2011 +0100
@@ -18,7 +18,7 @@
<schema name="ckan" version="1.2">
- <types>
+<types><fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/><fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/><fieldtype name="binary" class="solr.BinaryField"/>
@@ -34,131 +34,129 @@
<fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/><fieldType name="text" class="solr.TextField" positionIncrementGap="100">
- <analyzer type="index">
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <!-- in this example, we will only use synonyms at query time
- <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
- -->
- <!-- Case insensitive stop word removal.
- add enablePositionIncrements=true in both the index and query
- analyzers to leave a 'gap' for more accurate phrase queries.
- -->
- <filter class="solr.StopFilterFactory"
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.StopFilterFactory"
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
- </analyzer>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+ </analyzer></fieldType><!-- A general unstemmed text field - good if one does not know the language of the field --><fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
- <analyzer type="index">
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.StopFilterFactory"
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer></fieldType>
- </types>
+</types>
- <fields>
- <field name="id" type="string" indexed="true" stored="true" required="true" />
- <field name="site_id" type="string" indexed="true" stored="true" required="true" />
- <field name="title" type="text" indexed="true" stored="true" />
- <field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
- <field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
- <field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
- <field name="revision_id" type="string" indexed="true" stored="true" omitNorms="true" />
- <field name="version" type="string" indexed="true" stored="true" />
- <field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
- <field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
- <field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
- <field name="notes" type="text" indexed="true" stored="true"/>
- <field name="author" type="textgen" indexed="true" stored="true" />
- <field name="author_email" type="textgen" indexed="true" stored="true" />
- <field name="maintainer" type="textgen" indexed="true" stored="true" />
- <field name="maintainer_email" type="textgen" indexed="true" stored="true" />
- <field name="license" type="string" indexed="true" stored="true" />
- <field name="license_id" type="string" indexed="true" stored="true" />
- <field name="ratings_count" type="int" indexed="true" stored="false" />
- <field name="ratings_average" type="float" indexed="true" stored="false" />
- <field name="tags" type="string" indexed="true" stored="true" multiValued="true"/>
- <field name="groups" type="string" indexed="true" stored="true" multiValued="true"/>
-
- <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
- <field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
- <field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>
+<fields>
+ <field name="id" type="string" indexed="true" stored="true" required="true" />
+ <field name="site_id" type="string" indexed="true" stored="true" required="true" />
+ <field name="title" type="text" indexed="true" stored="true" />
+ <field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="revision_id" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="version" type="string" indexed="true" stored="true" />
+ <field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
+ <field name="notes" type="text" indexed="true" stored="true"/>
+ <field name="author" type="textgen" indexed="true" stored="true" />
+ <field name="author_email" type="textgen" indexed="true" stored="true" />
+ <field name="maintainer" type="textgen" indexed="true" stored="true" />
+ <field name="maintainer_email" type="textgen" indexed="true" stored="true" />
+ <field name="license" type="string" indexed="true" stored="true" />
+ <field name="license_id" type="string" indexed="true" stored="true" />
+ <field name="ratings_count" type="int" indexed="true" stored="false" />
+ <field name="ratings_average" type="float" indexed="true" stored="false" />
+ <field name="tags" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="groups" type="string" indexed="true" stored="true" multiValued="true"/>
- <!-- catchall field, containing all other searchable text fields (implemented
- via copyField further on in this schema -->
- <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
+ <field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>
- <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="extras_*" type="text" indexed="true" stored="false" multiValued="true"/>
+ <!-- catchall field, containing all other searchable text fields (implemented
+ via copyField further on in this schema -->
+ <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
-
- <dynamicField name="*" type="string" indexed="true" stored="false"/>
- </fields>
+ <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="extras_*" type="text" indexed="true" stored="false" multiValued="true"/>
- <uniqueKey>id</uniqueKey>
- <defaultSearchField>text</defaultSearchField>
- <solrQueryParser defaultOperator="AND"/>
+ <field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
- <copyField source="url" dest="urls"/>
- <copyField source="ckan_url" dest="urls"/>
- <copyField source="download_url" dest="urls"/>
- <copyField source="res_url" dest="urls"/>
-
+ <dynamicField name="*" type="string" indexed="true" stored="false"/>
+</fields>
- <copyField source="extras_*" dest="text"/>
- <copyField source="urls" dest="text"/>
- <copyField source="title" dest="text"/>
- <copyField source="text" dest="text"/>
- <copyField source="license" dest="text"/>
- <copyField source="notes" dest="text"/>
- <copyField source="tags" dest="text"/>
- <copyField source="groups" dest="text"/>
- <copyField source="res_description" dest="text"/>
- <copyField source="maintainer" dest="text"/>
- <copyField source="author" dest="text"/>
+<uniqueKey>id</uniqueKey>
+<defaultSearchField>text</defaultSearchField>
+<solrQueryParser defaultOperator="AND"/>
+
+<copyField source="url" dest="urls"/>
+<copyField source="ckan_url" dest="urls"/>
+<copyField source="download_url" dest="urls"/>
+<copyField source="res_url" dest="urls"/>
+<copyField source="extras_*" dest="text"/>
+<copyField source="urls" dest="text"/>
+<copyField source="name" dest="text"/>
+<copyField source="title" dest="text"/>
+<copyField source="text" dest="text"/>
+<copyField source="license" dest="text"/>
+<copyField source="notes" dest="text"/>
+<copyField source="tags" dest="text"/>
+<copyField source="groups" dest="text"/>
+<copyField source="res_description" dest="text"/>
+<copyField source="maintainer" dest="text"/>
+<copyField source="author" dest="text"/></schema>
-
http://bitbucket.org/okfn/ckanext-solr/changeset/1704dcb0c7b6/
changeset: 1704dcb0c7b6
user: John Glover
date: 2011-08-11 17:53:55
summary: Update package search tests
affected #: 1 file (4.3 KB)
--- a/tests/test_package_search.py Thu Aug 11 16:53:43 2011 +0100
+++ b/tests/test_package_search.py Thu Aug 11 16:53:55 2011 +0100
@@ -1,4 +1,3 @@
-import solr
from pylons import config
from ckan.tests import TestController, CreateTestData
from ckan import model
@@ -22,15 +21,13 @@
del gils.tags[idx]
model.repo.commit_and_remove()
# solr
- cls.solr = solr.SolrConnection(config.get('solr_url'))
- cls.fq = " +site_id:\"%s\" " % config.get('ckan.site_id')
search.rebuild()
cls.backend = search.get_backend()
@classmethod
def teardown_class(cls):
model.repo.rebuild_db()
- cls.solr.close()
+ search.get_backend().index_for('Package').clear()
def _pkg_names(self, result):
return ' '.join(result['results'])
@@ -118,25 +115,6 @@
result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden tags:somethingrandom')
assert self._pkg_names(result) == '', self._pkg_names(result)
- # TODO: cannot search tags from solr. Should we index tags?
-
- # def test_tag_basic(self):
- # result = self.backend.query_for('tag').run(query=u'gov')
- # assert result['count'] == 2, result
- # assert self._check_entity_names(result, ('gov', 'government')), self._pkg_names(result)
-
- # def test_tag_basic_2(self):
- # result = self.backend.query_for('tag').run(query=u'wildlife')
- # assert self._pkg_names(result) == 'wildlife', self._pkg_names(result)
-
- # def test_tag_with_tags_option(self):
- # result = self.backend.query_for('tag').run(query=u'tags:wildlife')
- # assert self._pkg_names(result) == 'wildlife', self._pkg_names(result)
-
- # def test_tag_with_blank_tags(self):
- # result = self.backend.query_for('tag').run(query=u'tags: wildlife')
- # assert self._pkg_names(result) == 'wildlife', self._pkg_names(result)
-
def test_pagination(self):
# large search
all_results = self.backend.query_for(model.Package).run(query=self.q_all)
@@ -170,3 +148,118 @@
pkgs = result['results']
assert len(pkgs) == 2, pkgs
assert pkgs == all_pkgs[4:6]
+
+ def test_order_by(self):
+ # large search
+ all_results = self.backend.query_for(model.Package).run(query=self.q_all)
+ all_pkgs = all_results['results']
+ all_pkg_count = all_results['count']
+
+ # rank
+ # TODO: fix this test
+ # options = search.QueryOptions()
+ # options.order_by = 'rank'
+ # result = self.backend.query_for(model.Package).run(query='penguin', options=options)
+ # pkgs = result['results']
+ # fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
+ # assert fields[0] == 'usa-courts-gov', fields # has penguin three times
+ # assert pkgs == all_pkgs, pkgs #default ordering
+
+ # name
+ options = search.QueryOptions()
+ options.order_by = 'name'
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
+ sorted_fields = fields; sorted_fields.sort()
+ assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+ # title
+ options = search.QueryOptions()
+ options.order_by = 'title'
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name).title for pkg_name in pkgs]
+ sorted_fields = fields; sorted_fields.sort()
+ assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+ # notes
+ options = search.QueryOptions()
+ options.order_by = 'notes'
+ result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+ pkgs = result['results']
+ fields = [model.Package.by_name(pkg_name).notes for pkg_name in pkgs]
+ sorted_fields = fields; sorted_fields.sort()
+ assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+ # extra field
+ # TODO: Fix this test
+ # options = SearchOptions({'q':self.q_all})
+ # options.order_by = 'date_released'
+ # result = Search().run(options)
+ # pkgs = result['results']
+ # fields = [model.Package.by_name(pkg_name).extras.get('date_released') for pkg_name in pkgs]
+ # sorted_fields = fields; sorted_fields.sort()
+ # assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+ def test_search_notes_on(self):
+ result = self.backend.query_for(model.Package).run(query=u'restrictions')
+ pkgs = result['results']
+ count = result['count']
+ assert len(pkgs) == 2, pkgs
+
+ def test_search_foreign_chars(self):
+ result = self.backend.query_for(model.Package).run(query='umlaut')
+ assert result['results'] == ['gils'], result['results']
+ result = self.backend.query_for(model.Package).run(query=u'thumb')
+ assert result['count'] == 0, result['results']
+ result = self.backend.query_for(model.Package).run(query=u'th\xfcmb')
+ assert result['results'] == ['gils'], result['results']
+
+ def test_groups(self):
+ result = self.backend.query_for(model.Package).run(query=u'groups:random')
+ assert self._pkg_names(result) == '', self._pkg_names(result)
+ result = self.backend.query_for(model.Package).run(query=u'groups:ukgov')
+ assert result['count'] == 4, self._pkg_names(result)
+ result = self.backend.query_for(model.Package).run(query=u'groups:ukgov tags:us')
+ assert result['count'] == 2, self._pkg_names(result)
+
+class TestSearchOverall(TestController):
+ @classmethod
+ def setup_class(cls):
+ CreateTestData.create()
+ search.rebuild()
+ cls.backend = search.get_backend()
+
+ @classmethod
+ def teardown_class(cls):
+ model.repo.rebuild_db()
+ search.get_backend().index_for('Package').clear()
+
+ def _check_search_results(self, terms, expected_count, expected_packages=[], only_open=False, only_downloadable=False):
+ options = search.QueryOptions()
+ options.filter_by_openness = only_open
+ options.filter_by_downloadable = only_downloadable
+ result = self.backend.query_for(model.Package).run(query=unicode(terms))
+ pkgs = result['results']
+ count = result['count']
+ assert count == expected_count, (count, expected_count)
+ for expected_pkg in expected_packages:
+ assert expected_pkg in pkgs, '%s : %s' % (expected_pkg, result)
+
+ def test_overall(self):
+ self._check_search_results('annakarenina', 1, ['annakarenina'])
+ self._check_search_results('warandpeace', 1, ['warandpeace'])
+ self._check_search_results('', 2)
+ self._check_search_results('A Novel By Tolstoy', 1, ['annakarenina'])
+ self._check_search_results('title:Novel', 1, ['annakarenina'])
+ self._check_search_results('title:peace', 0)
+ self._check_search_results('name:warandpeace', 1)
+ self._check_search_results('groups:david', 2)
+ self._check_search_results('groups:roger', 1)
+ self._check_search_results('groups:lenny', 0)
+ self._check_search_results('annakarenina', 1, ['annakarenina'], True, False)
+ self._check_search_results('annakarenina', 1, ['annakarenina'], False, True)
+ self._check_search_results('annakarenina', 1, ['annakarenina'], True, True)
+
+
Repository URL: https://bitbucket.org/okfn/ckanext-solr/
--
This is a commit notification from bitbucket.org. You are receiving
this message because the commit notification service is enabled for
this repository and is configured to send to this email address.
More information about the ckan-changes mailing list.