diff options
| author | Tom Lane | 2010-04-28 02:04:16 +0000 |
|---|---|---|
| committer | Tom Lane | 2010-04-28 02:04:16 +0000 |
| commit | 2c265adea3129c917296b46a82786d67988ece2c (patch) | |
| tree | dbb37543101132b3b6fff6b5a33bbe53d7921897 /src/test | |
| parent | d64b110a64c407ca8f4b3d885a62c05b51711d14 (diff) | |
Modify the built-in text search parser to handle URLs more nearly according
to RFC 3986. In particular, these characters now terminate the path part
of a URL: '"', '<', '>', '\', '^', '`', '{', '|', '}'. The previous behavior
was inconsistent and depended on whether a "?" was present in the path.
Per gripe from Donald Fraser and spec research by Kevin Grittner.
This is a pre-existing bug, but not back-patching since the risks of
breaking existing applications seem to outweigh the benefits.
Diffstat (limited to 'src/test')
| -rw-r--r-- | src/test/regress/expected/tsearch.out | 52 | ||||
| -rw-r--r-- | src/test/regress/sql/tsearch.sql | 6 |
2 files changed, 52 insertions, 6 deletions
diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 1cd9186d692..86ea5efc7ba 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -287,8 +287,10 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w 6 | 4aew.werc.ewr 12 | 14 | http:// + 5 | 5aew.werc.ewr:8100/? 6 | 5aew.werc.ewr:8100 - 12 | /? + 18 | /? + 12 | 1 | ad 12 | = 1 | qwe @@ -391,14 +393,14 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w 12 | 12 | <> 1 | qwerty -(131 rows) +(133 rows) SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 <i <b> wow < jqw <> qwerty'); - to_tsvector --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - '+4.0e-10':26 '-4.2':58,60 '/?ad=qwe&dw':7,10,14,22 '/?ad=qwe&dw=%20%32':25 '/awdf/dwqe/4325':46 '/usr/local/fff':45 '/wqe-324/ewr':49 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':61 '234.435':30 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':54,55,56 '455':31 '4aew.werc.ewr':15 '5.005':32 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100':24 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23 'ad':17 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':37 'dw':19 'efd.r':3 'ewr1':43 'ewri2':44 'gist.c':52 'gist.h':50 'gist.h.c':51 'hjwer':42 'jf':39 'jqw':64 'qwe':2,18,27,28,35 'qwe-wer':34 'qwer':38 'qwerti':65 'qwqwe':29 'readlin':53,57,59 'rewt/ewr':47 'sdjk':40 'teodor@stack.net':33 'wefjn':48 'wer':36 'wow':63 'www.com':4 + to_tsvector +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + '+4.0e-10':28 '-4.2':60,62 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':48 '/usr/local/fff':47 '/wqe-324/ewr':51 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':63 '234.435':32 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':56,57,58 '455':33 '4aew.werc.ewr':15 '5.005':34 '5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':39 'dw':21 'efd.r':3 'ewr1':45 'ewri2':46 'gist.c':54 'gist.h':52 'gist.h.c':53 'hjwer':44 'jf':41 'jqw':66 'qwe':2,20,29,30,37 'qwe-wer':36 'qwer':40 'qwerti':67 'qwqwe':31 'readlin':55,59,61 'rewt/ewr':49 'sdjk':42 'teodor@stack.net':35 'wefjn':50 'wer':38 'wow':65 'www.com':4 (1 row) SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> @@ -406,7 +408,7 @@ SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://ae <i <b> wow < jqw <> qwerty')); length -------- - 51 + 53 (1 row) -- ts_debug @@ -424,6 +426,44 @@ SELECT * from ts_debug('english', '<myns:foo-bar_baz.blurfl>abc&nm1;def©ghi tag | XML tag | </myns:foo-bar_baz.blurfl> | {} | | (9 rows) +-- check parsing of URLs +SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>'); + alias | description | token | dictionaries | dictionary | lexemes +----------+---------------+----------------------------------------+--------------+------------+------------------------------------------ + protocol | Protocol head | http:// | {} | | + url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx} + host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk} + url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx} + tag | XML tag | </span> | {} | | +(5 rows) + +SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>'); + alias | description | token | dictionaries | dictionary | lexemes +----------+---------------+----------------------------+--------------+------------+------------------------------ + protocol | Protocol head | http:// | {} | | + url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw} + host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr} + url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw} + tag | XML tag | <span> | {} | | +(5 rows) + +SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?'); + alias | description | token | dictionaries | dictionary | lexemes +----------+---------------+----------------------+--------------+------------+------------------------ + protocol | Protocol head | http:// | {} | | + url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?} + host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} + url_path | URL path | /? | {simple} | simple | {/?} +(4 rows) + +SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx'); + alias | description | token | dictionaries | dictionary | lexemes +----------+-------------+------------------------+--------------+------------+-------------------------- + url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx} + host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} + url_path | URL path | /?xx | {simple} | simple | {/?xx} +(3 rows) + -- to_tsquery SELECT to_tsquery('english', 'qwe & sKies '); to_tsquery diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index 3467b1f6de1..3c0a7dd82a8 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -105,6 +105,12 @@ SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://ae SELECT * from ts_debug('english', '<myns:foo-bar_baz.blurfl>abc&nm1;def©ghiõjkl</myns:foo-bar_baz.blurfl>'); +-- check parsing of URLs +SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>'); +SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>'); +SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?'); +SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx'); + -- to_tsquery SELECT to_tsquery('english', 'qwe & sKies '); |
