import re
# Functions for working with shortened posts
# This regexp does not try to match every possible URL, but we only run it
# against URLs that we have typed in ourselves, so it should be good enough.
# Should be kept in sync with the regexp in js/admin.js
_re_urlmatcher = re.compile(r'\bhttps?://\S+', re.I)
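# Note that since \S+ matches any run of non-whitespace, a URL at the end of a
# sentence, e.g. "see https://example.com.", is matched including the trailing
# period; that's fine for URLs we type in ourselves.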
# This is currently the value for Twitter and the default for Mastodon, so just
# use that globally for now.
url_shortened_len = 23
_url_counts_as_characters = "https://short.url/{}".format((url_shortened_len - len("https://short.url/")) * 'x')
# Return the length of a post, with each URL counted as url_shortened_len characters.
def get_shortened_post_length(txt):
    return len(_re_urlmatcher.sub(_url_counts_as_characters, txt))
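
# For example (with a made-up URL), a post like
#   "Read https://example.com/a/very/long/path/to/article now"
# has a raw length of 56, but the 47-character URL is substituted with the
# 23-character placeholder above, so get_shortened_post_length() returns
# 5 + 23 + 4 = 32.
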
# Truncate a text, taking into account URL shorteners. Will not truncate in the
# middle of a URL, but right now will happily truncate in the middle of a word
# (room for improvement!)
def truncate_shortened_post(txt, maxlen):
    matches = list(_re_urlmatcher.finditer(txt))
    if not matches:
        # Not a single URL, so just truncate
        return txt[:maxlen]
    firststart, firstend = matches[0].span()
    if firststart + url_shortened_len > maxlen:
        # We hit the size limit before the URL or in the middle of it, so skip the whole URL
        return txt[:firststart]
    # inlen is how far we have consumed the original text, outlen the length of
    # the output so far with every URL counted at its shortened length.
    inlen = firstend
    outlen = firststart + url_shortened_len
    for i, curr in enumerate(matches[1:]):
        # matches[i] is the match before curr, since the loop starts at matches[1]
        prevstart, prevend = matches[i].span()
        currstart, currend = curr.span()
        betweenlen = currstart - prevend
        if outlen + betweenlen > maxlen:
            # The limit was hit in the text between URLs
            left = maxlen - outlen
            return txt[:inlen + left]
        if outlen + betweenlen + url_shortened_len > maxlen:
            # The limit was hit in the middle of this URL, so include all the text
            # up to it, but skip the URL.
            return txt[:inlen + betweenlen]
        # The whole URL fit
        inlen += betweenlen + currend - currstart
        outlen += betweenlen + url_shortened_len
    return txt[:inlen + (maxlen - outlen)]
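
# A minimal usage sketch; the post text and the length limit below are made-up
# examples, not values taken from anywhere else in the system.
if __name__ == '__main__':
    post = "Intro text https://example.com/some/long/link and a trailing sentence that keeps going"
    # The 34-character URL counts as only 23 characters towards the length
    print(get_shortened_post_length(post))
    # Truncating to 40 keeps the whole URL plus six more characters of text,
    # since the URL counts as 23: 11 + 23 + 6 = 40
    print(truncate_shortened_post(post, 40))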