from django.shortcuts import render
from django.http import HttpResponseRedirect, Http404
from django.views.decorators.csrf import csrf_exempt
from django.conf import settings

from pgweb.util.decorators import cache, queryparams

import urllib.parse
import requests
import psycopg2

from pgweb.lists.models import MailingList

# Conditionally import memcached library. Everything will work without
# it, so we allow development installs to run without it...
try:
    import pylibmc
    has_memcached = True
except Exception:
    has_memcached = False


def generate_pagelinks(pagenum, totalpages, querystring):
    # Generate a list of links to page through a search result
    # We generate these in HTML from the python code because it's
    # simply too ugly to try to do it in the template.
    if totalpages < 2:
        return

    if pagenum > 1:
        # Prev link
        yield '<a href="%s&p=%s">Prev</a>' % (querystring, pagenum - 1)

    if pagenum > 10:
        start = pagenum - 10
    else:
        start = 1

    for i in range(start, min(start + 20, totalpages + 1)):
        if i == pagenum:
            # The current page is rendered as plain text, not a link
            yield "%s" % i
        else:
            yield '<a href="%s&p=%s">%s</a>' % (querystring, i, i)

    if pagenum != min(start + 20, totalpages):
        yield '<a href="%s&p=%s">Next</a>' % (querystring, pagenum + 1)
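
# Illustrative example (comment only, not executed): for pagenum=3,
# totalpages=5 and querystring="?q=foo", the generator above yields,
# in order:
#
#   <a href="?q=foo&p=2">Prev</a>
#   <a href="?q=foo&p=1">1</a>
#   <a href="?q=foo&p=2">2</a>
#   3
#   <a href="?q=foo&p=4">4</a>
#   <a href="?q=foo&p=5">5</a>
#   <a href="?q=foo&p=4">Next</a>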

@csrf_exempt
@queryparams('d', 'l', 'ln', 'm', 'p', 'q', 's', 'u')
@cache(minutes=30)
def search(request):
    # Perform a general web search
    # Since this lives in a different database, we open a direct
    # connection with psycopg, thus bypassing everything that has to do
    # with django.

    # constants that we might eventually want to make configurable
    hitsperpage = 20

    if request.GET.get('m', '') == '1':
        searchlists = True

        if request.GET.get('l', '') != '':
            try:
                listid = int(request.GET['l'])
                if listid >= 0:
                    # Make sure the list exists
                    if not MailingList.objects.filter(id=listid).exists():
                        raise Http404()
                else:
                    # Negative means it's a group, so verify that it exists
                    if not MailingList.objects.filter(group=-listid).exists():
                        raise Http404()
            except ValueError:
                # If it's not an integer we just don't care
                listid = None
        else:
            # Listid not specified. But do we have the name?
            if 'ln' in request.GET:
                try:
                    ll = MailingList.objects.get(listname=request.GET['ln'])
                    listid = ll.id
                except MailingList.DoesNotExist:
                    # Invalid list name just resets the default of the form,
                    # no need to throw an error.
                    listid = None
            else:
                listid = None

        if 'd' in request.GET:
            try:
                dateval = int(request.GET['d'])
            except Exception:
                dateval = None
        else:
            dateval = None

        if 's' in request.GET:
            listsort = request.GET['s']
            if listsort not in ('r', 'd', 'i'):
                listsort = 'r'
        else:
            listsort = 'r'

        if not dateval:
            dateval = 365

        sortoptions = (
            {'val': 'r', 'text': 'Rank', 'selected': request.GET.get('s', '') not in ('d', 'i')},
            {'val': 'd', 'text': 'Date', 'selected': request.GET.get('s', '') == 'd'},
            {'val': 'i', 'text': 'Reverse date', 'selected': request.GET.get('s', '') == 'i'},
        )
        dateoptions = (
            {'val': -1, 'text': 'anytime'},
            {'val': 1, 'text': 'within last day'},
            {'val': 7, 'text': 'within last week'},
            {'val': 31, 'text': 'within last month'},
            {'val': 186, 'text': 'within last 6 months'},
            {'val': 365, 'text': 'within last year'},
        )
    else:
        searchlists = False
        suburl = request.GET.get('u', None)

    # Check that we actually have something to search for
    if request.GET.get('q', '') == '':
        if searchlists:
            return render(request, 'search/listsearch.html', {
                'search_error': "No search term specified.",
                'sortoptions': sortoptions,
                'lists': MailingList.objects.all().order_by("group__sortkey"),
                'listid': listid,
                'dates': dateoptions,
                'dateval': dateval,
                'archives_root': settings.ARCHIVES_FRONT_ADDRESS,
            })
        else:
            return render(request, 'search/sitesearch.html', {
                'search_error': "No search term specified.",
            })

    query = request.GET['q'].strip()
    if '\0' in query or ((not searchlists) and suburl and '\0' in suburl):
        return render(request, 'search/sitesearch.html', {
            'search_error': "Invalid character in search.",
        })

    # Anti-stefan prevention
    if len(query) > 1000:
        return render(request, 'search/sitesearch.html', {
            'search_error': "Search term too long.",
        })

    # Is the request being paged?
    try:
        pagenum = int(request.GET.get('p', 1))
    except Exception:
        pagenum = 1

    firsthit = (pagenum - 1) * hitsperpage + 1
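    # For example: with hitsperpage = 20, page 1 gives firsthit = 1
    # (hits 1-20) and page 2 gives firsthit = 21 (hits 21-40).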

    if searchlists:
        # Lists are searched by passing the work down using a http
        # API. In the future, we probably want to do everything
        # through a http API and merge hits, but that's for later
        p = {
            'q': query.encode('utf-8'),
            's': listsort,
        }
        if listid:
            if listid < 0:
                # This is a list group, we expand that on the web server
                p['ln'] = ','.join([x.listname for x in MailingList.objects.filter(group=-listid)])
            else:
                p['ln'] = MailingList.objects.get(pk=listid).listname
        if dateval:
            p['d'] = dateval
        urlstr = urllib.parse.urlencode(p)

        # If memcached is available, let's try it
        hits = None
        if has_memcached:
            memc = pylibmc.Client(['127.0.0.1'], binary=True)
            # behavior not supported on pylibmc in squeeze: behaviors={'tcp_nodelay': True}
            try:
                hits = memc.get(urlstr)
            except Exception:
                # If we had an exception, don't try to store either
                memc = None
        if not hits:
            # Nothing in the cache - so try to get the hits from the search server
            try:
                r = requests.post(
                    "{}://{}/archives-search/".format(
                        'http' if settings.ARCHIVES_SEARCH_PLAINTEXT else 'https',
                        settings.ARCHIVES_SEARCH_SERVER),
                    urlstr,
                    headers={
                        'Content-type': 'application/x-www-form-urlencoded; charset=utf-8',
                    },
                    timeout=5,
                )
            except requests.exceptions.Timeout:
                return render(request, 'search/listsearch.html', {
                    'search_error': 'Timeout when talking to search server. Please try your search again later, or with more restrictive search terms.',
                })
            except Exception:
                return render(request, 'search/listsearch.html', {
                    'search_error': 'General error when talking to search server.',
                })
            if r.status_code != 200:
                memc = None
                return render(request, 'search/listsearch.html', {
                    'search_error': 'Error talking to search server: %s' % r.reason,
                })

            hits = r.json()

            if has_memcached and memc:
                # Store them in memcached too! But only for 10 minutes...
                # And always compress it, just because we can
                memc.set(urlstr, hits, 60 * 10, 1)
                memc = None

        if isinstance(hits, dict):
            # This is not just a list of hits.
            # Right now the only supported dict result is a messageid
            # match, but make sure that's what it is.
            if hits['messageidmatch'] == 1:
                return HttpResponseRedirect("/message-id/%s" % query)

        totalhits = len(hits)
        querystr = "?m=1&q=%s&l=%s&d=%s&s=%s" % (
            urllib.parse.quote_plus(query.encode('utf-8')),
            listid or '',
            dateval,
            listsort,
        )

        return render(request, 'search/listsearch.html', {
            'hitcount': totalhits,
            'firsthit': firsthit,
            'lasthit': min(totalhits, firsthit + hitsperpage - 1),
            'query': request.GET['q'],
            'archives_root': settings.ARCHIVES_FRONT_ADDRESS,
            'pagelinks': " ".join(
                generate_pagelinks(pagenum,
                                   (totalhits - 1) // hitsperpage + 1,
                                   querystr)),
            'hits': [{
                'date': h['d'],
                'subject': h['s'],
                'author': h['f'],
                'messageid': h['m'],
                'abstract': h['a'],
                'rank': h['r'],
            } for h in hits[firsthit - 1:firsthit + hitsperpage - 1]],
            'sortoptions': sortoptions,
            'lists': MailingList.objects.all().order_by("group__sortkey"),
            'listid': listid,
            'dates': dateoptions,
            'dateval': dateval,
        })
    else:
        # Website search is still done by making a regular pgsql connection
        # to the search server.
        try:
            conn = psycopg2.connect(settings.SEARCH_DSN)
            curs = conn.cursor()
        except Exception:
            return render(request, 'search/sitesearch.html', {
                'search_error': 'Could not connect to search database.',
            })

        # This is kind of a hack, but... Some URLs are flagged as internal
        # and should as such only be included in searches that explicitly
        # reference the suburl that they are in.
        if suburl and suburl.startswith('/docs/devel'):
            include_internal = True
        else:
            include_internal = False

        # perform the query for general web search
        try:
            curs.execute("SELECT * FROM site_search(%(query)s, %(firsthit)s, %(hitsperpage)s, %(suburl)s, %(internal)s)", {
                'query': query,
                'firsthit': firsthit - 1,
                'hitsperpage': hitsperpage,
                'suburl': suburl,
                'internal': include_internal,
            })
        except psycopg2.ProgrammingError:
            return render(request, 'search/sitesearch.html', {
                'search_error': 'Error executing search query.',
            })

        hits = curs.fetchall()
        conn.close()

        # The last row returned by site_search() carries the total hit
        # count in its rank column; the actual hits are the rows before it.
        totalhits = int(hits[-1][5])
        try:
            if suburl:
                quoted_suburl = urllib.parse.quote_plus(suburl)
            else:
                quoted_suburl = ''
        except Exception:
            quoted_suburl = ''
        querystr = "?q=%s&u=%s" % (
            urllib.parse.quote_plus(query.encode('utf-8')),
            quoted_suburl,
        )

        return render(request, 'search/sitesearch.html', {
            'suburl': suburl,
            'hitcount': totalhits,
            'firsthit': firsthit,
            'lasthit': min(totalhits, firsthit + hitsperpage - 1),
            'query': request.GET['q'],
            'pagelinks': " ".join(
                generate_pagelinks(pagenum,
                                   (totalhits - 1) // hitsperpage + 1,
                                   querystr)),
            'hits': [{
                'title': h[3],
                'url': "%s%s" % (h[1], h[2]),
                'abstract': h[4].replace("[[[[[[", "").replace("]]]]]]", ""),
                'rank': h[5],
            } for h in hits[:-1]],
        })
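
# Illustrative sketch only (comment, not executed; archives.example.org is a
# hypothetical host): the archives search above boils down to a form-encoded
# POST returning JSON, so a standalone client could mimic it along these lines:
#
#   import urllib.parse
#   import requests
#
#   params = urllib.parse.urlencode({'q': 'vacuum full', 's': 'r', 'd': 365})
#   r = requests.post(
#       "https://archives.example.org/archives-search/",
#       params,
#       headers={'Content-type': 'application/x-www-form-urlencoded; charset=utf-8'},
#       timeout=5,
#   )
#   hits = r.json()
#   # Normally a list of dicts keyed 'd', 's', 'f', 'm', 'a', 'r'
#   # (date, subject, author, message-id, abstract, rank); a dict such as
#   # {'messageidmatch': 1} signals a direct message-id match instead.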