[ckan-changes] commit/ckanext-solr: 2 new changesets

Bitbucket commits-noreply at bitbucket.org
Thu Aug 11 15:54:40 UTC 2011


2 new changesets in ckanext-solr:

http://bitbucket.org/okfn/ckanext-solr/changeset/27d20d7a0d16/
changeset:   27d20d7a0d16
user:        John Glover
date:        2011-08-11 17:53:43
summary:     add name field to default search (text)
affected #:  1 file (142 bytes)

--- a/schema.xml	Thu Aug 11 14:10:52 2011 +0100
+++ b/schema.xml	Thu Aug 11 16:53:43 2011 +0100
@@ -18,7 +18,7 @@
 
 <schema name="ckan" version="1.2">
 
-  <types>
+<types><fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/><fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/><fieldtype name="binary" class="solr.BinaryField"/>
@@ -34,131 +34,129 @@
     <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/><fieldType name="text" class="solr.TextField" positionIncrementGap="100">
-      <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <!-- in this example, we will only use synonyms at query time
-        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-        -->
-        <!-- Case insensitive stop word removal.
-          add enablePositionIncrements=true in both the index and query
-          analyzers to leave a 'gap' for more accurate phrase queries.
-        -->
-        <filter class="solr.StopFilterFactory"
+        <analyzer type="index">
+            <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+            <!-- in this example, we will only use synonyms at query time
+            <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+            -->
+            <!-- Case insensitive stop word removal.
+              add enablePositionIncrements=true in both the index and query
+              analyzers to leave a 'gap' for more accurate phrase queries.
+            -->
+            <filter class="solr.StopFilterFactory"
                 ignoreCase="true"
                 words="stopwords.txt"
                 enablePositionIncrements="true"
                 />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory"
+            <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+            <filter class="solr.LowerCaseFilterFactory"/>
+            <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+        </analyzer>
+        <analyzer type="query">
+            <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+            <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+            <filter class="solr.StopFilterFactory"
                 ignoreCase="true"
                 words="stopwords.txt"
                 enablePositionIncrements="true"
                 />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
-      </analyzer>
+            <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+            <filter class="solr.LowerCaseFilterFactory"/>
+            <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+        </analyzer></fieldType><!-- A general unstemmed text field - good if one does not know the language of the field --><fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
-      <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory"
+        <analyzer type="index">
+            <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+            <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+            <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
+            <filter class="solr.LowerCaseFilterFactory"/>
+        </analyzer>
+        <analyzer type="query">
+            <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+            <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+            <filter class="solr.StopFilterFactory"
                 ignoreCase="true"
                 words="stopwords.txt"
                 enablePositionIncrements="true"
                 />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
+            <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+            <filter class="solr.LowerCaseFilterFactory"/>
+        </analyzer></fieldType>
- </types>
+</types>
 
 
- <fields>
-   <field name="id" type="string" indexed="true" stored="true" required="true" /> 
-     <field name="site_id" type="string" indexed="true" stored="true" required="true" /> 
-   <field name="title" type="text" indexed="true" stored="true" />
-     <field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
-     <field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
-   <field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
-   <field name="revision_id" type="string" indexed="true" stored="true" omitNorms="true" />
-   <field name="version" type="string" indexed="true" stored="true" /> 
-   <field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
-   <field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
-     <field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
-     <field name="notes" type="text" indexed="true" stored="true"/>
-     <field name="author" type="textgen" indexed="true" stored="true" />
-     <field name="author_email" type="textgen" indexed="true" stored="true" />
-     <field name="maintainer" type="textgen" indexed="true" stored="true" />
-     <field name="maintainer_email" type="textgen" indexed="true" stored="true" />
-   <field name="license" type="string" indexed="true" stored="true" />
-     <field name="license_id" type="string" indexed="true" stored="true" />
-   <field name="ratings_count" type="int" indexed="true" stored="false" />
-   <field name="ratings_average" type="float" indexed="true" stored="false" />
-     <field name="tags" type="string" indexed="true" stored="true" multiValued="true"/>
-   <field name="groups" type="string" indexed="true" stored="true" multiValued="true"/>
-     
-     <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
-     <field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
-   <field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>
+<fields>
+    <field name="id" type="string" indexed="true" stored="true" required="true" /> 
+    <field name="site_id" type="string" indexed="true" stored="true" required="true" /> 
+    <field name="title" type="text" indexed="true" stored="true" />
+    <field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="revision_id" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="version" type="string" indexed="true" stored="true" /> 
+    <field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="notes" type="text" indexed="true" stored="true"/>
+    <field name="author" type="textgen" indexed="true" stored="true" />
+    <field name="author_email" type="textgen" indexed="true" stored="true" />
+    <field name="maintainer" type="textgen" indexed="true" stored="true" />
+    <field name="maintainer_email" type="textgen" indexed="true" stored="true" />
+    <field name="license" type="string" indexed="true" stored="true" />
+    <field name="license_id" type="string" indexed="true" stored="true" />
+    <field name="ratings_count" type="int" indexed="true" stored="false" />
+    <field name="ratings_average" type="float" indexed="true" stored="false" />
+    <field name="tags" type="string" indexed="true" stored="true" multiValued="true"/>
+    <field name="groups" type="string" indexed="true" stored="true" multiValued="true"/>
 
-   <!-- catchall field, containing all other searchable text fields (implemented
-        via copyField further on in this schema  -->
-   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
+    <field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
+    <field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>
 
-   <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="extras_*" type="text" indexed="true" stored="false" multiValued="true"/>
+    <!-- catchall field, containing all other searchable text fields (implemented
+         via copyField further on in this schema  -->
+    <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
 
-   <field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
-   
-   <dynamicField name="*" type="string" indexed="true"  stored="false"/>
- </fields>
+    <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="extras_*" type="text" indexed="true" stored="false" multiValued="true"/>
 
- <uniqueKey>id</uniqueKey>
- <defaultSearchField>text</defaultSearchField>
- <solrQueryParser defaultOperator="AND"/>
+    <field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
 
- <copyField source="url" dest="urls"/>
- <copyField source="ckan_url" dest="urls"/>
- <copyField source="download_url" dest="urls"/>
- <copyField source="res_url" dest="urls"/>
- 
+    <dynamicField name="*" type="string" indexed="true"  stored="false"/>
+</fields>
 
- <copyField source="extras_*" dest="text"/>
- <copyField source="urls" dest="text"/>
- <copyField source="title" dest="text"/>
- <copyField source="text" dest="text"/>
- <copyField source="license" dest="text"/>
- <copyField source="notes" dest="text"/>
- <copyField source="tags" dest="text"/>
- <copyField source="groups" dest="text"/>
- <copyField source="res_description" dest="text"/>
- <copyField source="maintainer" dest="text"/>
- <copyField source="author" dest="text"/>
+<uniqueKey>id</uniqueKey>
+<defaultSearchField>text</defaultSearchField>
+<solrQueryParser defaultOperator="AND"/>
+
+<copyField source="url" dest="urls"/>
+<copyField source="ckan_url" dest="urls"/>
+<copyField source="download_url" dest="urls"/>
+<copyField source="res_url" dest="urls"/>
+<copyField source="extras_*" dest="text"/>
+<copyField source="urls" dest="text"/>
+<copyField source="name" dest="text"/>
+<copyField source="title" dest="text"/>
+<copyField source="text" dest="text"/>
+<copyField source="license" dest="text"/>
+<copyField source="notes" dest="text"/>
+<copyField source="tags" dest="text"/>
+<copyField source="groups" dest="text"/>
+<copyField source="res_description" dest="text"/>
+<copyField source="maintainer" dest="text"/>
+<copyField source="author" dest="text"/></schema>
-


http://bitbucket.org/okfn/ckanext-solr/changeset/1704dcb0c7b6/
changeset:   1704dcb0c7b6
user:        John Glover
date:        2011-08-11 17:53:55
summary:     Update package search tests
affected #:  1 file (4.3 KB)

--- a/tests/test_package_search.py	Thu Aug 11 16:53:43 2011 +0100
+++ b/tests/test_package_search.py	Thu Aug 11 16:53:55 2011 +0100
@@ -1,4 +1,3 @@
-import solr
 from pylons import config
 from ckan.tests import TestController, CreateTestData
 from ckan import model
@@ -22,15 +21,13 @@
         del gils.tags[idx]
         model.repo.commit_and_remove()
         # solr
-        cls.solr = solr.SolrConnection(config.get('solr_url'))
-        cls.fq = " +site_id:\"%s\" " % config.get('ckan.site_id')
         search.rebuild()
         cls.backend = search.get_backend()
 
     @classmethod
     def teardown_class(cls):
         model.repo.rebuild_db()
-        cls.solr.close()
+        search.get_backend().index_for('Package').clear()
 
     def _pkg_names(self, result):
         return ' '.join(result['results'])
@@ -118,25 +115,6 @@
         result = self.backend.query_for(model.Package).run(query=u'tags:country-sweden tags:somethingrandom')
         assert self._pkg_names(result) == '', self._pkg_names(result)
 
-    # TODO: cannot search tags from solr. Should we index tags?
-
-    # def test_tag_basic(self):
-    #     result = self.backend.query_for('tag').run(query=u'gov')
-    #     assert result['count'] == 2, result
-    #     assert self._check_entity_names(result, ('gov', 'government')), self._pkg_names(result)
-
-    # def test_tag_basic_2(self):
-    #     result = self.backend.query_for('tag').run(query=u'wildlife')
-    #     assert self._pkg_names(result) == 'wildlife', self._pkg_names(result)
-
-    # def test_tag_with_tags_option(self):
-    #     result = self.backend.query_for('tag').run(query=u'tags:wildlife')
-    #     assert self._pkg_names(result) == 'wildlife', self._pkg_names(result)
-
-    # def test_tag_with_blank_tags(self):
-    #     result = self.backend.query_for('tag').run(query=u'tags: wildlife')
-    #     assert self._pkg_names(result) == 'wildlife', self._pkg_names(result)
-
     def test_pagination(self):
         # large search
         all_results = self.backend.query_for(model.Package).run(query=self.q_all)
@@ -170,3 +148,118 @@
         pkgs = result['results']
         assert len(pkgs) == 2, pkgs
         assert pkgs == all_pkgs[4:6]
+
+    def test_order_by(self):
+        # large search
+        all_results = self.backend.query_for(model.Package).run(query=self.q_all)
+        all_pkgs = all_results['results']
+        all_pkg_count = all_results['count']
+
+        # rank
+        # TODO: fix this test
+        # options = search.QueryOptions()
+        # options.order_by = 'rank'
+        # result = self.backend.query_for(model.Package).run(query='penguin', options=options)
+        # pkgs = result['results']
+        # fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
+        # assert fields[0] == 'usa-courts-gov', fields # has penguin three times
+        # assert pkgs == all_pkgs, pkgs #default ordering        
+
+        # name
+        options = search.QueryOptions()
+        options.order_by = 'name'
+        result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+        pkgs = result['results']
+        fields = [model.Package.by_name(pkg_name).name for pkg_name in pkgs]
+        sorted_fields = fields; sorted_fields.sort()
+        assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+        # title
+        options = search.QueryOptions()
+        options.order_by = 'title'
+        result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+        pkgs = result['results']
+        fields = [model.Package.by_name(pkg_name).title for pkg_name in pkgs]
+        sorted_fields = fields; sorted_fields.sort()
+        assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+        # notes
+        options = search.QueryOptions()
+        options.order_by = 'notes'
+        result = self.backend.query_for(model.Package).run(query=self.q_all, options=options)
+        pkgs = result['results']
+        fields = [model.Package.by_name(pkg_name).notes for pkg_name in pkgs]
+        sorted_fields = fields; sorted_fields.sort()
+        assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+        # extra field
+        # TODO: Fix this test
+        # options = SearchOptions({'q':self.q_all})
+        # options.order_by = 'date_released'
+        # result = Search().run(options)
+        # pkgs = result['results']
+        # fields = [model.Package.by_name(pkg_name).extras.get('date_released') for pkg_name in pkgs]
+        # sorted_fields = fields; sorted_fields.sort()
+        # assert fields == sorted_fields, repr(fields) + repr(sorted_fields)
+
+    def test_search_notes_on(self):
+        result = self.backend.query_for(model.Package).run(query=u'restrictions')
+        pkgs = result['results']
+        count = result['count']
+        assert len(pkgs) == 2, pkgs
+        
+    def test_search_foreign_chars(self):
+        result = self.backend.query_for(model.Package).run(query='umlaut')
+        assert result['results'] == ['gils'], result['results']
+        result = self.backend.query_for(model.Package).run(query=u'thumb')
+        assert result['count'] == 0, result['results']
+        result = self.backend.query_for(model.Package).run(query=u'th\xfcmb')
+        assert result['results'] == ['gils'], result['results']
+
+    def test_groups(self):
+        result = self.backend.query_for(model.Package).run(query=u'groups:random')
+        assert self._pkg_names(result) == '', self._pkg_names(result)
+        result = self.backend.query_for(model.Package).run(query=u'groups:ukgov')
+        assert result['count'] == 4, self._pkg_names(result)
+        result = self.backend.query_for(model.Package).run(query=u'groups:ukgov tags:us')
+        assert result['count'] == 2, self._pkg_names(result)
+
+class TestSearchOverall(TestController):
+    @classmethod
+    def setup_class(cls):
+        CreateTestData.create()
+        search.rebuild()
+        cls.backend = search.get_backend()
+
+    @classmethod
+    def teardown_class(cls):
+        model.repo.rebuild_db()
+        search.get_backend().index_for('Package').clear()
+
+    def _check_search_results(self, terms, expected_count, expected_packages=[], only_open=False, only_downloadable=False):
+        options = search.QueryOptions()
+        options.filter_by_openness = only_open
+        options.filter_by_downloadable = only_downloadable
+        result = self.backend.query_for(model.Package).run(query=unicode(terms))
+        pkgs = result['results']
+        count = result['count']
+        assert count == expected_count, (count, expected_count)
+        for expected_pkg in expected_packages:
+            assert expected_pkg in pkgs, '%s : %s' % (expected_pkg, result)
+
+    def test_overall(self):
+        self._check_search_results('annakarenina', 1, ['annakarenina'])
+        self._check_search_results('warandpeace', 1, ['warandpeace'])
+        self._check_search_results('', 2)
+        self._check_search_results('A Novel By Tolstoy', 1, ['annakarenina'])
+        self._check_search_results('title:Novel', 1, ['annakarenina'])
+        self._check_search_results('title:peace', 0)
+        self._check_search_results('name:warandpeace', 1)
+        self._check_search_results('groups:david', 2)
+        self._check_search_results('groups:roger', 1)
+        self._check_search_results('groups:lenny', 0)
+        self._check_search_results('annakarenina', 1, ['annakarenina'], True, False)
+        self._check_search_results('annakarenina', 1, ['annakarenina'], False, True)
+        self._check_search_results('annakarenina', 1, ['annakarenina'], True, True)
+        
+

Repository URL: https://bitbucket.org/okfn/ckanext-solr/

--

This is a commit notification from bitbucket.org. You are receiving
it because the commit notification service is enabled for this
repository and is configured to send to this email address.




More information about the ckan-changes mailing list