Skip to content

Commit cc3fa20

Browse files
PierreQuentel authored and orsenthil committed
bpo-29979: Rewrite cgi.parse_multipart to make it consistent with FieldStorage (#991)
1 parent f34c685 commit cc3fa20

File tree

5 files changed

+38
-102
lines changed

5 files changed

+38
-102
lines changed

Doc/library/cgi.rst

+10-9
Original file line numberDiff line numberDiff line change
@@ -294,19 +294,20 @@ algorithms implemented in this module in other circumstances.
294294
This function is deprecated in this module. Use :func:`urllib.parse.parse_qsl`
295295
instead. It is maintained here only for backward compatibility.
296296

297-
.. function:: parse_multipart(fp, pdict)
297+
.. function:: parse_multipart(fp, pdict, encoding="utf-8")
298298

299299
Parse input of type :mimetype:`multipart/form-data` (for file uploads).
300-
Arguments are *fp* for the input file and *pdict* for a dictionary containing
301-
other parameters in the :mailheader:`Content-Type` header.
300+
Arguments are *fp* for the input file, *pdict* for a dictionary containing
301+
other parameters in the :mailheader:`Content-Type` header, and *encoding*,
302+
the request encoding.
302303

303-
Returns a dictionary just like :func:`urllib.parse.parse_qs` keys are the field names, each
304-
value is a list of values for that field. This is easy to use but not much good
305-
if you are expecting megabytes to be uploaded --- in that case, use the
306-
:class:`FieldStorage` class instead which is much more flexible.
304+
Returns a dictionary just like :func:`urllib.parse.parse_qs`: keys are the
305+
field names, each value is a list of values for that field. For non-file
306+
fields, the value is a list of strings.
307307

308-
Note that this does not parse nested multipart parts --- use
309-
:class:`FieldStorage` for that.
308+
This is easy to use but not much good if you are expecting megabytes to be
309+
uploaded --- in that case, use the :class:`FieldStorage` class instead
310+
which is much more flexible.
310311

311312

312313
.. function:: parse_header(string)

Doc/whatsnew/3.7.rst

+8
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,14 @@ New Modules
9595
Improved Modules
9696
================
9797

98+
cgi
99+
---
100+
101+
:func:`~cgi.parse_multipart` returns the same results as
102+
:class:`~cgi.FieldStorage`: for non-file fields, the value associated with a key
103+
is a list of strings, not bytes.
104+
(Contributed by Pierre Quentel in :issue:`29979`.)
105+
98106
binascii
99107
--------
100108

Lib/cgi.py

+14-91
Original file line numberDiff line numberDiff line change
@@ -198,105 +198,28 @@ def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
198198
DeprecationWarning, 2)
199199
return urllib.parse.parse_qsl(qs, keep_blank_values, strict_parsing)
200200

201-
def parse_multipart(fp, pdict):
201+
def parse_multipart(fp, pdict, encoding="utf-8"):
202202
"""Parse multipart input.
203203
204204
Arguments:
205205
fp : input file
206206
pdict: dictionary containing other parameters of content-type header
207+
encoding: request encoding
207208
208209
Returns a dictionary just like parse_qs(): keys are the field names, each
209-
value is a list of values for that field. This is easy to use but not
210-
much good if you are expecting megabytes to be uploaded -- in that case,
211-
use the FieldStorage class instead which is much more flexible. Note
212-
that content-type is the raw, unparsed contents of the content-type
213-
header.
214-
215-
XXX This does not parse nested multipart parts -- use FieldStorage for
216-
that.
217-
218-
XXX This should really be subsumed by FieldStorage altogether -- no
219-
point in having two implementations of the same parsing algorithm.
220-
Also, FieldStorage protects itself better against certain DoS attacks
221-
by limiting the size of the data read in one chunk. The API here
222-
does not support that kind of protection. This also affects parse()
223-
since it can call parse_multipart().
224-
210+
value is a list of values for that field. For non-file fields, the value
211+
is a list of strings.
225212
"""
226-
import http.client
227-
228-
boundary = b""
229-
if 'boundary' in pdict:
230-
boundary = pdict['boundary']
231-
if not valid_boundary(boundary):
232-
raise ValueError('Invalid boundary in multipart form: %r'
233-
% (boundary,))
234-
235-
nextpart = b"--" + boundary
236-
lastpart = b"--" + boundary + b"--"
237-
partdict = {}
238-
terminator = b""
239-
240-
while terminator != lastpart:
241-
bytes = -1
242-
data = None
243-
if terminator:
244-
# At start of next part. Read headers first.
245-
headers = http.client.parse_headers(fp)
246-
clength = headers.get('content-length')
247-
if clength:
248-
try:
249-
bytes = int(clength)
250-
except ValueError:
251-
pass
252-
if bytes > 0:
253-
if maxlen and bytes > maxlen:
254-
raise ValueError('Maximum content length exceeded')
255-
data = fp.read(bytes)
256-
else:
257-
data = b""
258-
# Read lines until end of part.
259-
lines = []
260-
while 1:
261-
line = fp.readline()
262-
if not line:
263-
terminator = lastpart # End outer loop
264-
break
265-
if line.startswith(b"--"):
266-
terminator = line.rstrip()
267-
if terminator in (nextpart, lastpart):
268-
break
269-
lines.append(line)
270-
# Done with part.
271-
if data is None:
272-
continue
273-
if bytes < 0:
274-
if lines:
275-
# Strip final line terminator
276-
line = lines[-1]
277-
if line[-2:] == b"\r\n":
278-
line = line[:-2]
279-
elif line[-1:] == b"\n":
280-
line = line[:-1]
281-
lines[-1] = line
282-
data = b"".join(lines)
283-
line = headers['content-disposition']
284-
if not line:
285-
continue
286-
key, params = parse_header(line)
287-
if key != 'form-data':
288-
continue
289-
if 'name' in params:
290-
name = params['name']
291-
else:
292-
continue
293-
if name in partdict:
294-
partdict[name].append(data)
295-
else:
296-
partdict[name] = [data]
297-
298-
return partdict
299-
213+
# RFC 2046, Section 5.1 : The "multipart" boundary delimiters are always
214+
# represented as 7bit US-ASCII.
215+
boundary = pdict['boundary'].decode('ascii')
216+
ctype = "multipart/form-data; boundary={}".format(boundary)
217+
headers = Message()
218+
headers.set_type(ctype)
219+
headers['Content-Length'] = pdict['CONTENT-LENGTH']
220+
fs = FieldStorage(fp, headers=headers, encoding=encoding,
221+
environ={'REQUEST_METHOD': 'POST'})
222+
return {k: fs.getlist(k) for k in fs}
300223

301224
def _parseparam(s):
302225
while s[:1] == ';':

Lib/test/test_cgi.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,8 @@ def test_parse_multipart(self):
126126
env = {'boundary': BOUNDARY.encode('latin1'),
127127
'CONTENT-LENGTH': '558'}
128128
result = cgi.parse_multipart(fp, env)
129-
expected = {'submit': [b' Add '], 'id': [b'1234'],
130-
'file': [b'Testing 123.\n'], 'title': [b'']}
129+
expected = {'submit': [' Add '], 'id': ['1234'],
130+
'file': [b'Testing 123.\n'], 'title': ['']}
131131
self.assertEqual(result, expected)
132132

133133
def test_fieldstorage_properties(self):

Misc/NEWS

+4
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,10 @@ Extension Modules
317317
Library
318318
-------
319319

320+
- bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and
321+
making its results consistent with those of FieldStorage for
322+
multipart/form-data requests. Patch by Pierre Quentel.
323+
320324
- bpo-30243: Removed the __init__ methods of _json's scanner and encoder.
321325
Misusing them could cause memory leaks or crashes. Now scanner and encoder
322326
objects are completely initialized in the __new__ methods.

0 commit comments

Comments
 (0)