Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(url): canonicalize_url('http://您好.中国:80/') failed #98

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,19 +146,23 @@ def test_safe_url_idna(self):

# Japanese
(u'http://はじめよう.みんな/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'),
(u'http://はじめよう.みんな:80/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c:80/?query=%E3%82%B5&maxResults=5'),

# Russian
(u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'),
(u'http://кто.рф/index.php?domain=Что', 'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'),

# Korean
(u'http://내도메인.한국:80/', 'http://xn--220b31d95hq8o.xn--3e0b707e:80/'),
(u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'),
(u'http://맨체스터시티축구단.한국/', 'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'),

# Arabic
(u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'),

# Chinese
(u'http://您好.中国/', 'http://xn--5usr0o.xn--fiqs8s/'),
(u'http://您好.中国:80/', 'http://xn--5usr0o.xn--fiqs8s:80/'),
(u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'),
(u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'),
(u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'),
Expand Down Expand Up @@ -394,10 +398,15 @@ def test_typical_usage(self):
def test_port_number(self):
self.assertEqual(canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"),
"http://www.example.com:8888/do?a=1&b=2&c=3")

self.assertEqual(canonicalize_url(u'http://您好.中国:80/'), 'http://xn--5usr0o.xn--fiqs8s:80/')

# trailing empty ports are removed
self.assertEqual(canonicalize_url("http://www.example.com:/do?a=1&b=2&c=3"),
"http://www.example.com/do?a=1&b=2&c=3")

self.assertEqual(canonicalize_url(u'http://您好.中国:/'), 'http://xn--5usr0o.xn--fiqs8s/')

def test_sorting(self):
self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
"http://www.example.com/do?a=50&b=2&b=5&c=3")
Expand Down Expand Up @@ -522,10 +531,17 @@ def test_domains_are_case_insensitive(self):
def test_canonicalize_idns(self):
self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')

self.assertEqual(canonicalize_url(u'http://www.bücher.de:80?q=bücher'),
'http://www.xn--bcher-kva.de:80/?q=b%C3%BCcher')

# Japanese (+ reordering query parameters)
self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')

self.assertEqual(canonicalize_url(u'http://はじめよう.みんな:80/?query=サ&maxResults=5'),
'http://xn--p8j9a0d9c9a.xn--q9jyb4c:80/?maxResults=5&query=%E3%82%B5')

def test_quoted_slash_and_question_sign(self):
self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
"http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
Expand Down
32 changes: 24 additions & 8 deletions w3lib/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,28 @@
from w3lib.util import to_bytes, to_native_str, to_unicode


def _encode_netloc(onetloc):
"""
:type onetloc: unicode
:rtype: unicode
"""
try:
idx = onetloc.rfind(u':')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the tests.
My #98 (comment) still holds though.
Have you considered using SplitResult .hostname and .port attributes?

Copy link
Author

@hidva hidva Oct 28, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current behavior:

# Calling this function on an already "safe" URL will return the URL unmodified.
>>> w3lib.url.safe_url_string('https://www.baIdu.com')
'https://www.baIdu.com'  # I
>>> w3lib.url.canonicalize_url('http://www.baidu.com:80000000')
'http://www.baidu.com:80000000/'

when _encode_netloc() using SplitResult .hostname and .port attributes:

>>> w3lib.url.safe_url_string('https://www.baIdu.com')
'https://www.baidu.com'
>>> w3lib.url.canonicalize_url('http://www.baidu.com:80000000')
'http://www.baidu.com/'

because python 3.5 says:

hostname is Host name (lower case)...

Is this what we want?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with lowercasing the hostname. But it is indeed a change in behavior.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

>>> w3lib.url.canonicalize_url('http://www.baidu.com:80000000')
'http://www.baidu.com/'

what's the reason for the "lost" port number?

Copy link
Author

@hidva hidva Oct 28, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

python 3.5 says: port is None when value if not valid:

Python 2.7.12 (default, Nov 19 2016, 06:48:10) 
[GCC 5.4.0 20160609] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
...                                     urldefrag, urlencode, urlparse,
...                                     quote, parse_qs, parse_qsl,
...                                     ParseResult, unquote, urlunparse)
>>> 
>>> t = urlparse('http://www.BaidU.com:80')  # valid port
>>> t.hostname, t.port
('www.baidu.com', 80)
>>> t = urlparse('http://www.BaidU.com:65536')  # invalid port
>>> t.hostname, t.port
('www.baidu.com', None)

And the new _encode_netloc():

def _encode_netloc(parts):
    """
    :type parts: ParseResult
    :rtype: unicode
    """
    try:
        hostname = to_unicode(parts.hostname.encode('idna'))
        netloc = hostname if parts.port is None else '%s:%s' % (hostname, parts.port)
        # lost the port number if parts.port is None
    except UnicodeError:
        netloc = parts.netloc
    return netloc

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a shame that urlsplit and urlparse in Python 2.7 and 3.5 simply swallow an invalid port number.
Python 3.6 does report an error:

$ python
Python 3.6.2 (default, Aug 24 2017, 10:48:24) 
[GCC 6.3.0 20170406] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from urllib.parse import urlsplit
>>> t = urlsplit('http://www.BaidU.com:654444')
>>> t.port
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python3.6/urllib/parse.py", line 169, in port
    raise ValueError("Port out of range 0-65535")
ValueError: Port out of range 0-65535

if idx != -1:
hostname = onetloc[:idx]
portpart = onetloc[idx:]
else:
hostname = onetloc
portpart = u''
# assert isinstance(hostname, unicode)
# assert isinstance(portpart, unicode)
hostname = to_unicode(hostname.encode('idna'))
netloc = hostname + portpart
except UnicodeError:
netloc = onetloc
return netloc


# error handling function for bytes-to-Unicode decoding errors with URLs
def _quote_byte(error):
return (to_unicode(quote(error.object[error.start:error.end])), error.end)
Expand Down Expand Up @@ -61,10 +83,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):

# IDNA encoding can fail for too long labels (>63 characters)
# or missing labels (e.g. http://.example.com)
try:
netloc = parts.netloc.encode('idna')
except UnicodeError:
netloc = parts.netloc
netloc = _encode_netloc(parts.netloc)

# quote() in Python2 return type follows input type;
# quote() in Python3 always returns Unicode (native str)
Expand Down Expand Up @@ -373,10 +392,7 @@ def parse_data_uri(uri):
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
# IDNA encoding can fail for too long labels (>63 characters)
# or missing labels (e.g. http://.example.com)
try:
netloc = parts.netloc.encode('idna')
except UnicodeError:
netloc = parts.netloc
netloc = _encode_netloc(parts.netloc)

return (
to_native_str(parts.scheme),
Expand Down