Source code for scraper_helper.url
from typing import Optional
from urllib.parse import parse_qsl
from urllib.parse import urlencode
from urllib.parse import urlparse
[docs]
def change_param(url, param, new_value, create_new=False, upgrade_https=False):
    """Takes a url and changes the value of a query string parameter.

    When the url already has a query string, the parameter is set to
    new_value (and added if it was not present). The existing query is parsed
    into a dict, so repeated keys collapse to their last value and parameters
    with blank values are dropped. A "#fragment" that follows the query string
    is carried over unchanged.

    @param url: The input url
    @param param: The name of the query string parameter that needs to be changed
    @param new_value: The new value for the parameter
    @param create_new: If set to True and the url has no query string, a query
        string holding only this parameter is appended
    @param upgrade_https: If set to True, a leading http:// is upgraded to https://
    @return: Updated URL
    @raise ValueError: If url is empty or None
    """
    if not url:
        raise ValueError("URL cannot be null")

    base, has_query, rest = url.partition("?")
    if has_query:
        # Split off a trailing "#fragment" so it is not percent-encoded into
        # the value of the last parameter.
        query, hash_sep, fragment = rest.partition("#")
        params = dict(parse_qsl(query))
        params[str(param)] = new_value
        new_url = base + "?" + urlencode(params) + hash_sep + fragment
    elif create_new:
        new_url = url + "?" + urlencode({param: new_value})
    else:
        new_url = url

    # The upgrade applies on every path (also when the url had no query
    # string) and only touches the scheme prefix, never an "http://" that
    # happens to appear later in the url.
    if upgrade_https and new_url[:7].lower() == "http://":
        new_url = "https://" + new_url[7:]
    return new_url
[docs]
def get_query_str_val(url: str, qs: str) -> Optional[str]:
    """Takes a url and extracts the value of a query string parameter.

    The value is returned URL-decoded. When the parameter occurs more than
    once, the last occurrence wins. A "#fragment" following the query string
    is ignored.

    @param url: The input url
    @param qs: The name of the query string parameter to look up
    @return: The parameter value, or None when the url has no query string or
        the parameter is not present (or has a blank value)
    @rtype: Optional[str]
    @raise ValueError: If url is empty or None
    """
    if not url:
        raise ValueError("URL cannot be null")

    _, has_query, rest = url.partition("?")
    if not has_query:
        return None

    # Drop a trailing "#fragment" so it does not leak into the last value.
    query = rest.partition("#")[0]
    return dict(parse_qsl(query)).get(qs)
[docs]
def strip_qs_params(url):
    """Takes a url and strips all query string parameters.

    The fragment (and any ";params" suffix on the last path segment) is
    removed together with the query string.

    @param url: Any url like https://coderecode.com/scrapy-crash-course?src=git
    @return: url without parameters: https://coderecode.com/scrapy-crash-course
    @raise ValueError: If url is empty or None
    """
    if not url:
        raise ValueError("URL cannot be null")

    u = urlparse(url)
    if u.scheme:
        return f"{u.scheme}://{u.netloc}{u.path}"
    # Without a scheme there is nothing to put in front of "://"; emitting it
    # anyway would turn "/page?x=1" into the malformed ":///page".
    if u.netloc:
        # Protocol-relative url such as //host/page?x=1
        return f"//{u.netloc}{u.path}"
    return u.path
[docs]
def get_root_address(url):
    """Takes a url and returns its root address (scheme and network location).

    The result is always built as "<scheme>://<netloc>", without a trailing
    slash; path, query string and fragment are discarded. For input that has
    no scheme and no "//host" part both pieces are empty, so the result is
    just "://".

    @param url: Any url like https://coderecode.com/scrapy-crash-course?src=git
    @return: root url without path or parameters: https://coderecode.com
    @raise ValueError: If url is empty or None
    """
    if url:
        parts = urlparse(url)
        return "://".join((parts.scheme, parts.netloc))
    raise ValueError("URL cannot be null")