pgweb/search/views.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323

from django.shortcuts import render
from django.http import HttpResponseRedirect, Http404
from django.views.decorators.csrf import csrf_exempt
from django.conf import settings

from pgweb.util.decorators import cache, queryparams

import urllib.parse
import requests
import psycopg2

from pgweb.lists.models import MailingList

# Conditionally import memcached library. Everything will work without
# it, so we allow development installs to run without it...
try:
    import pylibmc
    has_memcached = True
except Exception as e:
    has_memcached = False


def generate_pagelinks(pagenum, totalpages, querystring):
    # Generate a list of links to page through a search result
    # We generate these in HTML from the python code because it's
    # simply too ugly to try to do it in the template.
    if totalpages < 2:
        return

    if pagenum > 1:
        # Prev link
        yield '<a href="%s&p=%s">Prev</a>' % (querystring, pagenum - 1)

    if pagenum > 10:
        start = pagenum - 10
    else:
        start = 1

    for i in range(start, min(start + 20, totalpages + 1)):
        if i == pagenum:
            yield "%s" % i
        else:
            yield '<a href="%s&p=%s">%s</a>' % (querystring, i, i)

    if pagenum != min(start + 20, totalpages):
        yield '<a href="%s&p=%s">Next</a>' % (querystring, pagenum + 1)


@csrf_exempt
@queryparams('d', 'l', 'ln', 'm', 'p', 'q', 's', 'u')
@cache(minutes=30)
def search(request):
    # Perform a general web search
    # Since this lives in a different database, we open a direct
    # connection with psycopg, thus bypassing everything that has to do
    # with django.

    # constants that we might eventually want to make configurable
    hitsperpage = 20

    if request.GET.get('m', '') == '1':
        searchlists = True

        if request.GET.get('l', '') != '':
            try:
                listid = int(request.GET['l'])
                if listid >= 0:
                    # Make sure the list exists
                    if not MailingList.objects.filter(id=listid).exists():
                        raise Http404()
                else:
                    # Negative means it's a group, so verify that it exists
                    if not MailingList.objects.filter(group=-listid).exists():
                        raise Http404()
            except ValueError:
                # If it's not an integer we just don't care
                listid = None
        else:
            # Listid not specified. But do we have the name?
            if 'ln' in request.GET:
                try:
                    ll = MailingList.objects.get(listname=request.GET['ln'])
                    listid = ll.id
                except MailingList.DoesNotExist:
                    # Invalid list name just resets the default of the form,
                    # no need to throw an error.
                    listid = None
            else:
                listid = None

        if 'd' in request.GET:
            try:
                dateval = int(request.GET['d'])
            except Exception as e:
                dateval = None
        else:
            dateval = None

        if 's' in request.GET:
            listsort = request.GET['s']
            if listsort not in ('r', 'd', 'i'):
                listsort = 'r'
        else:
            listsort = 'r'

        if not dateval:
            dateval = 365

        sortoptions = (
            {'val': 'r', 'text': 'Rank', 'selected': request.GET.get('s', '') not in ('d', 'i')},
            {'val': 'd', 'text': 'Date', 'selected': request.GET.get('s', '') == 'd'},
            {'val': 'i', 'text': 'Reverse date', 'selected': request.GET.get('s', '') == 'i'},
        )
        dateoptions = (
            {'val': -1, 'text': 'anytime'},
            {'val': 1, 'text': 'within last day'},
            {'val': 7, 'text': 'within last week'},
            {'val': 31, 'text': 'within last month'},
            {'val': 186, 'text': 'within last 6 months'},
            {'val': 365, 'text': 'within last year'},
        )
    else:
        searchlists = False
        suburl = request.GET.get('u', None)

    # Check that we actually have something to search for
    if request.GET.get('q', '') == '':
        if searchlists:
            return render(request, 'search/listsearch.html', {
                'search_error': "No search term specified.",
                'sortoptions': sortoptions,
                'lists': MailingList.objects.all().order_by("group__sortkey"),
                'listid': listid,
                'dates': dateoptions,
                'dateval': dateval,
                'archives_root': settings.ARCHIVES_FRONT_ADDRESS,
            })
        else:
            return render(request, 'search/sitesearch.html', {
                'search_error': "No search term specified.",
            })
    query = request.GET['q'].strip()
    if '\0' in query or ((not searchlists) and suburl and '\0' in suburl):
        return render(request, 'search/sitesearch.html', {
            'search_error': "Invalid character in search.",
        })

    # Anti-stefan prevention
    if len(query) > 1000:
        return render(request, 'search/sitesearch.html', {
            'search_error': "Search term too long.",
        })

    # Is the request being paged?
    try:
        pagenum = int(request.GET.get('p', 1))
    except Exception as e:
        pagenum = 1

    firsthit = (pagenum - 1) * hitsperpage + 1

    if searchlists:
        # Lists are searched by passing the work down using a http
        # API. In the future, we probably want to do everything
        # through a http API and merge hits, but that's for later
        p = {
            'q': query.encode('utf-8'),
            's': listsort,
        }
        if listid:
            if listid < 0:
                # This is a list group, we expand that on the web server
                p['ln'] = ','.join([x.listname for x in MailingList.objects.filter(group=-listid)])
            else:
                p['ln'] = MailingList.objects.get(pk=listid).listname
        if dateval:
            p['d'] = dateval
        urlstr = urllib.parse.urlencode(p)
        # If memcached is available, let's try it
        hits = None
        if has_memcached:
            memc = pylibmc.Client(['127.0.0.1', ], binary=True)
            # behavior not supported on pylibmc in squeeze:: behaviors={'tcp_nodelay':True})
            try:
                hits = memc.get(urlstr)
            except Exception:
                # If we had an exception, don't try to store either
                memc = None
        if not hits:
            # No hits found - so try to get them from the search server
            try:
                r = requests.post(
                    "{}://{}/archives-search/".format(settings.ARCHIVES_SEARCH_PLAINTEXT and 'http' or 'https', settings.ARCHIVES_SEARCH_SERVER),
                    urlstr,
                    headers={
                        'Content-type': 'application/x-www-form-urlencoded; charset=utf-8',
                    },
                    timeout=5,
                )
            except requests.exceptions.Timeout:
                return render(request, 'search/listsearch.html', {
                    'search_error': 'Timeout when talking to search server. Please try your search again later, or with a more restrictive search terms.',
                })
            except Exception as e:
                return render(request, 'search/listsearch.html', {
                    'search_error': 'General error when talking to search server.',
                })
            if r.status_code != 200:
                memc = None
                return render(request, 'search/listsearch.html', {
                    'search_error': 'Error talking to search server: %s' % r.reason,
                })
            hits = r.json()
            if has_memcached and memc:
                # Store them in memcached too! But only for 10 minutes...
                # And always compress it, just because we can
                memc.set(urlstr, hits, 60 * 10, 1)
                memc = None

        if isinstance(hits, dict):
            # This is not just a list of hits.
            # Right now the only supported dict result is a messageid
            # match, but make sure that's what it is.
            if hits['messageidmatch'] == 1:
                return HttpResponseRedirect("/message-id/%s" % query)

        totalhits = len(hits)
        querystr = "?m=1&q=%s&l=%s&d=%s&s=%s" % (
            urllib.parse.quote_plus(query.encode('utf-8')),
            listid or '',
            dateval,
            listsort
        )

        return render(request, 'search/listsearch.html', {
            'hitcount': totalhits,
            'firsthit': firsthit,
            'lasthit': min(totalhits, firsthit + hitsperpage - 1),
            'query': request.GET['q'],
            'archives_root': settings.ARCHIVES_FRONT_ADDRESS,
            'pagelinks': "&nbsp;".join(
                generate_pagelinks(pagenum,
                                   (totalhits - 1) // hitsperpage + 1,
                                   querystr)),
            'hits': [{
                'date': h['d'],
                'subject': h['s'],
                'author': h['f'],
                'messageid': h['m'],
                'abstract': h['a'],
                'rank': h['r'],
            } for h in hits[firsthit - 1:firsthit + hitsperpage - 1]],
            'sortoptions': sortoptions,
            'lists': MailingList.objects.all().order_by("group__sortkey"),
            'listid': listid,
            'dates': dateoptions,
            'dateval': dateval,
        })

    else:
        # Website search is still done by making a regular pgsql connection
        # to the search server.
        try:
            conn = psycopg2.connect(settings.SEARCH_DSN)
            curs = conn.cursor()
        except Exception as e:
            return render(request, 'search/sitesearch.html', {
                'search_error': 'Could not connect to search database.'
            })

        # This is kind of a hack, but... Some URLs are flagged as internal
        # and should as such only be included in searches that explicitly
        # reference the suburl that they are in.
        if suburl and suburl.startswith('/docs/devel'):
            include_internal = True
        else:
            include_internal = False

        # perform the query for general web search
        try:
            curs.execute("SELECT * FROM site_search(%(query)s, %(firsthit)s, %(hitsperpage)s, %(suburl)s, %(internal)s)", {
                'query': query,
                'firsthit': firsthit - 1,
                'hitsperpage': hitsperpage,
                'suburl': suburl,
                'internal': include_internal,
            })
        except psycopg2.ProgrammingError:
            return render(request, 'search/sitesearch.html', {
                'search_error': 'Error executing search query.'
            })

        hits = curs.fetchall()
        conn.close()
        totalhits = int(hits[-1][5])
        try:
            if suburl:
                quoted_suburl = urllib.parse.quote_plus(suburl)
            else:
                quoted_suburl = ''
        except Exception as e:
            quoted_suburl = ''
        querystr = "?q=%s&u=%s" % (
            urllib.parse.quote_plus(query.encode('utf-8')),
            quoted_suburl,
        )

        return render(request, 'search/sitesearch.html', {
            'suburl': suburl,
            'hitcount': totalhits,
            'firsthit': firsthit,
            'lasthit': min(totalhits, firsthit + hitsperpage - 1),
            'query': request.GET['q'],
            'pagelinks': "&nbsp;".join(
                generate_pagelinks(pagenum,
                                   (totalhits - 1) // hitsperpage + 1,
                                   querystr)),
            'hits': [{
                'title': h[3],
                'url': "%s%s" % (h[1], h[2]),
                'abstract': h[4].replace("[[[[[[", "<strong>").replace("]]]]]]", "</strong>"),
                'rank': h[5]} for h in hits[:-1]],
        })