Source code for scraper_helper.text

import re


def get_clean_currency(param, keep_comma=False, keep_period=True):
    """
    Extracts the first numeric amount found in a currency string.

    The first run of digits (optionally starting with a period, and possibly
    containing commas and periods) is located anywhere in the input, so
    leading currency symbols or words do not prevent a match. Separators
    that were not requested are then removed from that run.

    Examples:
        get_clean_currency('$1,234.50')                   -> '1234.50'
        get_clean_currency('$1,234.50', keep_comma=True)  -> '1,234.50'
        get_clean_currency('no price')                    -> None

    @param param: String that contains an amount, e.g. '$1,234.50'
    @param keep_comma: Keep commas in the result. Defaults to False
    @param keep_period: Keep periods in the result. Defaults to True
    @return: The cleaned amount as a string, or None if param is None or
             contains no digits
    """
    if param is None:
        return None

    # Require at least one digit: a pattern that can match the empty string
    # would "succeed" at position 0 of any input starting with a symbol.
    match = re.search(r'\.?\d[\d,.]*', param)
    if match is None:
        return None

    amount = match.group(0)
    # Remove unwanted separators instead of stopping at them, so that
    # '1,234.50' becomes '1234.50' rather than being truncated to '1'.
    if not keep_comma:
        amount = amount.replace(',', '')
    if not keep_period:
        amount = amount.replace('.', '')
    return amount


def cleanup(s):
    """
    Takes a string and cleans it by collapsing every run of whitespace
    (spaces, tabs, newlines, carriage returns and non-breaking spaces)
    into a single space and trimming both ends.

    @param s: Any string
    @return: Cleaned up string, or None when s is empty/None or nothing
             but whitespace remains
    """
    if not s:
        return None

    # str.split() with no arguments splits on any Unicode whitespace,
    # including '\r', '\n', '\t' and '\xa0' (&nbsp), and discards empty
    # pieces, so one split/join pass does all of the normalisation.
    cleaned = ' '.join(s.split())
    return cleaned or None
def get_headers(s: str, sep: str = ': ', strip_cookie: bool = True, strip_cl: bool = True, strip_headers: list = []) -> dict:
    """
    get_headers will be deprecated. Use get_dict instead.

    Forwards all of its arguments, unchanged, to get_dict and returns
    whatever get_dict returns.

    @param s: Input string, one "key<sep>value" pair per line
    @param sep: The separator for key and value. Defaults to ': '
    @param strip_cookie: Passed through to get_dict. Defaults to True
    @param strip_cl: Passed through to get_dict. Defaults to True
    @param strip_headers: Passed through to get_dict
    @return: dictionary
    @rtype: dict
    """
    parsed = get_dict(
        s,
        sep=sep,
        strip_cookie=strip_cookie,
        strip_cl=strip_cl,
        strip_headers=strip_headers,
    )
    return parsed
def get_dict(s, sep=': ', strip_cookie=True, strip_cl=True, strip_headers: list = None) -> dict:
    """
    Takes headers copied from dev tools and converts them to a dictionary.

    Note that this considers each line as a new dictionary key.
    Thus pass input as a string in triple quotes.

    Example Input:
        '''
        accept: */*
        accept-encoding: gzip, deflate, br
        '''
    Example Output:
        {'accept': '*/*', 'accept-encoding': 'gzip, deflate, br'}

    Lines that are blank, that do not contain the separator, or whose key
    starts with ':' (HTTP/2 pseudo-headers such as ':authority') are skipped.
    Only the first occurrence of the separator on a line splits key from
    value, so values that contain the separator are kept intact.

    @param s: Input string in triple quotes
    @param sep: The separator for key and value. Defaults to ': '
    @param strip_cookie: Remove cookies (case-insensitive). Defaults to True
    @param strip_cl: Remove content-length (case-insensitive). Defaults to True
    @param strip_headers: Optional list of keys that need to be excluded
                          (exact match)
    @return: dictionary
    @rtype: dict
    """
    # None instead of a mutable [] default; treat "not given" as "exclude nothing".
    excluded = strip_headers if strip_headers is not None else ()

    result = {}
    for line in s.split('\n'):
        line = line.strip()
        if not line or sep not in line:
            continue

        # partition splits on the FIRST separator only, so a value such as
        # 'a: b' survives when sep is ': '.
        key, _, value = line.partition(sep)

        if key[:1] == ':':
            continue

        lowered = key.lower()
        if strip_cookie and lowered == 'cookie':
            continue
        if strip_cl and lowered == 'content-length':
            continue
        if key in excluded:
            continue

        result[key] = value
    return result
def headers(browser="chrome"):
    """
    Returns a fresh dictionary of default browser-like request headers.

    @param browser: Browser name, compared case-insensitively. "firefox"
                    swaps in a Firefox user-agent; "chrome" and any other
                    value return the Chrome set unchanged
    @return: dictionary of header names to values
    """
    profile = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'referer': 'https://www.google.com/',
        'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36',
    }

    # Only Firefox differs from the base set, and only in its user-agent.
    if browser.lower() == "firefox":
        profile['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'

    return profile