from django.shortcuts import render
from django.http import HttpResponseRedirect, Http404
from django.views.decorators.csrf import csrf_exempt
from django.conf import settings
from pgweb.util.decorators import cache, queryparams
import urllib.parse
import requests
import psycopg2
from pgweb.lists.models import MailingList
# Conditionally import memcached library. Everything will work without
# it, so we allow development installs to run without it...
try:
    import pylibmc
    has_memcached = True
except Exception:
    has_memcached = False


def generate_pagelinks(pagenum, totalpages, querystring):
    # Generate a list of links to page through a search result
    # We generate these in HTML from the python code because it's
    # simply too ugly to try to do it in the template.
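    #
    # For example, generate_pagelinks(3, 40, "?q=foo") yields a Prev link
    # to p=2, numbered links for pages 1-20 (the current page, 3, as plain
    # text) and a Next link to p=4.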
    if totalpages < 2:
        return

    if pagenum > 1:
        # Prev link
        yield '<a href="%s&p=%s">Prev</a>' % (querystring, pagenum - 1)

    if pagenum > 10:
        start = pagenum - 10
    else:
        start = 1

    for i in range(start, min(start + 20, totalpages + 1)):
        if i == pagenum:
            # The current page is rendered as plain text, not a link
            yield "%s" % i
        else:
            yield '<a href="%s&p=%s">%s</a>' % (querystring, i, i)

    if pagenum != min(start + 20, totalpages):
        # Next link
        yield '<a href="%s&p=%s">Next</a>' % (querystring, pagenum + 1)
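

# queryparams() declares the GET parameters this view accepts, and cache()
# marks the response as cacheable (for 30 minutes here); both come from
# pgweb.util.decorators.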
@csrf_exempt
@queryparams('d', 'l', 'ln', 'm', 'p', 'q', 's', 'u')
@cache(minutes=30)
def search(request):
    # Perform a general web search
    # Since this lives in a different database, we open a direct
    # connection with psycopg, thus bypassing everything that has to do
    # with django.

    # constants that we might eventually want to make configurable
    hitsperpage = 20

    if request.GET.get('m', '') == '1':
        searchlists = True

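        # The 'l' parameter is a mailing list id: positive values select a
        # single list, negative values select a whole list group (expanded
        # into individual list names further down).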
        if request.GET.get('l', '') != '':
            try:
                listid = int(request.GET['l'])
                if listid >= 0:
                    # Make sure the list exists
                    if not MailingList.objects.filter(id=listid).exists():
                        raise Http404()
                else:
                    # Negative means it's a group, so verify that it exists
                    if not MailingList.objects.filter(group=-listid).exists():
                        raise Http404()
            except ValueError:
                # If it's not an integer we just don't care
                listid = None
        else:
            # Listid not specified. But do we have the name?
            if 'ln' in request.GET:
                try:
                    ll = MailingList.objects.get(listname=request.GET['ln'])
                    listid = ll.id
                except MailingList.DoesNotExist:
                    # Invalid list name just resets the default of the form,
                    # no need to throw an error.
                    listid = None
            else:
                listid = None

        if 'd' in request.GET:
            try:
                dateval = int(request.GET['d'])
            except Exception:
                dateval = None
        else:
            dateval = None

        if 's' in request.GET:
            listsort = request.GET['s']
            if listsort not in ('r', 'd', 'i'):
                listsort = 'r'
        else:
            listsort = 'r'

        if not dateval:
            dateval = 365

        sortoptions = (
            {'val': 'r', 'text': 'Rank', 'selected': request.GET.get('s', '') not in ('d', 'i')},
            {'val': 'd', 'text': 'Date', 'selected': request.GET.get('s', '') == 'd'},
            {'val': 'i', 'text': 'Reverse date', 'selected': request.GET.get('s', '') == 'i'},
        )
        dateoptions = (
            {'val': -1, 'text': 'anytime'},
            {'val': 1, 'text': 'within last day'},
            {'val': 7, 'text': 'within last week'},
            {'val': 31, 'text': 'within last month'},
            {'val': 186, 'text': 'within last 6 months'},
            {'val': 365, 'text': 'within last year'},
        )
    else:
        searchlists = False
        suburl = request.GET.get('u', None)
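        # 'u' restricts a site search to URLs under the given prefix; it is
        # passed through to the site_search() database function below.
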
    # Check that we actually have something to search for
    if request.GET.get('q', '') == '':
        if searchlists:
            return render(request, 'search/listsearch.html', {
                'search_error': "No search term specified.",
                'sortoptions': sortoptions,
                'lists': MailingList.objects.all().order_by("group__sortkey"),
                'listid': listid,
                'dates': dateoptions,
                'dateval': dateval,
                'archives_root': settings.ARCHIVES_FRONT_ADDRESS,
            })
        else:
            return render(request, 'search/sitesearch.html', {
                'search_error': "No search term specified.",
            })
    query = request.GET['q'].strip()

    if '\0' in query or ((not searchlists) and suburl and '\0' in suburl):
        return render(request, 'search/sitesearch.html', {
            'search_error': "Invalid character in search.",
        })

    # Anti-stefan prevention
    if len(query) > 1000:
        return render(request, 'search/sitesearch.html', {
            'search_error': "Search term too long.",
        })

    # Is the request being paged?
    try:
        pagenum = int(request.GET.get('p', 1))
    except Exception:
        pagenum = 1

    firsthit = (pagenum - 1) * hitsperpage + 1
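    # firsthit is 1-based: with hitsperpage = 20, page 1 starts at hit 1,
    # page 2 at hit 21, and so on.
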
    if searchlists:
        # Lists are searched by passing the work down using a http
        # API. In the future, we probably want to do everything
        # through a http API and merge hits, but that's for later
        p = {
            'q': query.encode('utf-8'),
            's': listsort,
        }
        if listid:
            if listid < 0:
                # This is a list group, we expand that on the web server
                p['ln'] = ','.join([x.listname for x in MailingList.objects.filter(group=-listid)])
            else:
                p['ln'] = MailingList.objects.get(pk=listid).listname
        if dateval:
            p['d'] = dateval
        urlstr = urllib.parse.urlencode(p)
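        # urlstr doubles as both the POST body sent to the search server and
        # the memcached key for the result, e.g. "q=foo&s=r&d=365".
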
        # If memcached is available, let's try it
        hits = None
        if has_memcached:
            memc = pylibmc.Client(['127.0.0.1', ], binary=True)
            # behavior not supported on pylibmc in squeeze: behaviors={'tcp_nodelay': True}
            try:
                hits = memc.get(urlstr)
            except Exception:
                # If we had an exception, don't try to store either
                memc = None

        if not hits:
            # Nothing in the cache, so ask the search server
            try:
                r = requests.post(
                    "{}://{}/archives-search/".format(
                        'http' if settings.ARCHIVES_SEARCH_PLAINTEXT else 'https',
                        settings.ARCHIVES_SEARCH_SERVER,
                    ),
                    urlstr,
                    headers={
                        'Content-type': 'application/x-www-form-urlencoded; charset=utf-8',
                    },
                    timeout=5,
                )
            except requests.exceptions.Timeout:
                return render(request, 'search/listsearch.html', {
                    'search_error': 'Timeout when talking to search server. Please try your search again later, or with a more restrictive search term.',
                })
            except Exception:
                return render(request, 'search/listsearch.html', {
                    'search_error': 'General error when talking to search server.',
                })
            if r.status_code != 200:
                memc = None
                return render(request, 'search/listsearch.html', {
                    'search_error': 'Error talking to search server: %s' % r.reason,
                })
            hits = r.json()
            if has_memcached and memc:
                # Store them in memcached too! But only for 10 minutes...
                # And always compress it, just because we can
                # (set() args: key, value, ttl in seconds, min_compress_len)
                memc.set(urlstr, hits, 60 * 10, 1)
                memc = None

        if isinstance(hits, dict):
            # This is not just a list of hits.
            # Right now the only supported dict result is a messageid
            # match, but make sure that's what it is.
            if hits['messageidmatch'] == 1:
                return HttpResponseRedirect("/message-id/%s" % query)

        totalhits = len(hits)
        querystr = "?m=1&q=%s&l=%s&d=%s&s=%s" % (
            urllib.parse.quote_plus(query.encode('utf-8')),
            listid or '',
            dateval,
            listsort,
        )
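        # Each hit from the search server is a dict keyed by single letters:
        # d=date, s=subject, f=from, m=message-id, a=abstract, r=rank.
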
        return render(request, 'search/listsearch.html', {
            'hitcount': totalhits,
            'firsthit': firsthit,
            'lasthit': min(totalhits, firsthit + hitsperpage - 1),
            'query': request.GET['q'],
            'archives_root': settings.ARCHIVES_FRONT_ADDRESS,
            'pagelinks': " ".join(
                generate_pagelinks(pagenum,
                                   (totalhits - 1) // hitsperpage + 1,
                                   querystr)),
            'hits': [{
                'date': h['d'],
                'subject': h['s'],
                'author': h['f'],
                'messageid': h['m'],
                'abstract': h['a'],
                'rank': h['r'],
            } for h in hits[firsthit - 1:firsthit + hitsperpage - 1]],
            'sortoptions': sortoptions,
            'lists': MailingList.objects.all().order_by("group__sortkey"),
            'listid': listid,
            'dates': dateoptions,
            'dateval': dateval,
        })
    else:
        # Website search is still done by making a regular pgsql connection
        # to the search server.
        try:
            conn = psycopg2.connect(settings.SEARCH_DSN)
            curs = conn.cursor()
        except Exception:
            return render(request, 'search/sitesearch.html', {
                'search_error': 'Could not connect to search database.',
            })

        # This is kind of a hack, but... Some URLs are flagged as internal
        # and should as such only be included in searches that explicitly
        # reference the suburl that they are in.
        if suburl and suburl.startswith('/docs/devel'):
            include_internal = True
        else:
            include_internal = False

        # perform the query for general web search
        try:
            curs.execute("SELECT * FROM site_search(%(query)s, %(firsthit)s, %(hitsperpage)s, %(suburl)s, %(internal)s)", {
                'query': query,
                'firsthit': firsthit - 1,
                'hitsperpage': hitsperpage,
                'suburl': suburl,
                'internal': include_internal,
            })
        except psycopg2.ProgrammingError:
            return render(request, 'search/sitesearch.html', {
                'search_error': 'Error executing search query.',
            })
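        # site_search() returns one row per hit plus a trailing summary row;
        # field index 5 of that last row holds the total hit count.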
        hits = curs.fetchall()
        conn.close()
        totalhits = int(hits[-1][5])
        try:
            if suburl:
                quoted_suburl = urllib.parse.quote_plus(suburl)
            else:
                quoted_suburl = ''
        except Exception:
            quoted_suburl = ''
        querystr = "?q=%s&u=%s" % (
            urllib.parse.quote_plus(query.encode('utf-8')),
            quoted_suburl,
        )

        return render(request, 'search/sitesearch.html', {
            'suburl': suburl,
            'hitcount': totalhits,
            'firsthit': firsthit,
            'lasthit': min(totalhits, firsthit + hitsperpage - 1),
            'query': request.GET['q'],
            'pagelinks': " ".join(
                generate_pagelinks(pagenum,
                                   (totalhits - 1) // hitsperpage + 1,
                                   querystr)),
            'hits': [{
                'title': h[3],
                'url': "%s%s" % (h[1], h[2]),
                # [[[[[[ and ]]]]]] are the highlight delimiters produced by
                # the search function, turned into bold tags for display
                'abstract': h[4].replace("[[[[[[", "<b>").replace("]]]]]]", "</b>"),
                'rank': h[5]} for h in hits[:-1]],
        })