Source code for railgun.common.url

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @file: railgun/common/url.py
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This file is released under BSD 2-clause license.

import re


class UrlMatcher(object):
    """Find all the urls in a given text, and optionally rewrite each of
    them through a callback function.

    For example, if you want to replace all ``file:///`` urls with
    ``http://`` ones, where all the files are located under
    ``/var/www/share/``, and all the http urls should start with
    ``http://localhost/files/``, then you may write::

        def ReplaceUrl(url):
            if url.startswith('file:///var/www/share/'):
                return 'http://localhost/files/%s' % url[22:]
            return url

        matcher = UrlMatcher(schemas=['file'])
        payload = "Here is the movie: " \\
            "file:///var/www/share/favourites/Harry-Potter.mov"
        print(matcher.replace(payload, ReplaceUrl))

    Note that only the following characters are considered components of
    a url::

        A-Z, a-z, 0-9, and any one of "-_.~!*';:@&=+$,/?#"

    In particular ``%`` is not in that set, so a percent-encoded url is cut
    off at its first ``%``.

    Schema names are matched literally: regular expression metacharacters
    inside a schema (such as the ``+`` in ``svn+ssh``) carry no special
    meaning.

    :param schemas: Interested url schemas.
    :type schemas: iterable of :class:`str`
    """

    def __init__(self, schemas=('http', 'https', 'ftp')):
        # An immutable default avoids sharing one list object across calls.
        # Each schema is escaped so that it is matched as plain text rather
        # than being interpreted as a regular expression fragment.
        schema_pattern = '|'.join(re.escape(schema) for schema in schemas)
        self.pattern = (r"(%s)://[A-Za-z0-9-_.~!\*';:@&=+$,/?#]*" %
                        schema_pattern)
        self.regex = re.compile(self.pattern)

    def findall(self, payload):
        """Get all matching urls from given `payload`.

        :param payload: Text that may contain urls.
        :type payload: :class:`str`
        :return: A generator yielding each matched url, in order of
            appearance.
        """
        for m in self.regex.finditer(payload):
            # group() with no argument is the whole url, not just the schema
            # captured by the parenthesised group.
            yield m.group()

    def replace(self, payload, callback):
        """Replace all matching urls in given `payload` with `callback`.

        :param payload: Text that may contain urls.
        :type payload: :class:`str`
        :param callback: Function to generate new urls from old ones.  It
            is called once per matched url with the url text, and its
            return value is substituted in place of that url.
        :type callback: method(:class:`str`) -> :class:`str`

        :return: Replaced text.
        """
        # re.sub hands the callback a match object; unwrap it so the user
        # callback only ever deals with plain url strings.
        return self.regex.sub(lambda m: callback(m.group()), payload)


def reform_path(path):
    """Reformat the given path to Unix style.

    The given path will be modified according to the following rules:

    *   "/" is the delimiter among different components of the path.
    *   "\\\\" will be treated as "/".
    *   Continuous "/" will be considered as one.
    *   Component "." will be removed from the path.
    *   Component ".." will consume one parent in the path.
    *   A leading "/" will be reserved, while a trailing "/" will be removed.
    *   Other components will be output without translation.

    There are some special cases:

    *   "/" will result in "/", since the only slash is both a leading and
        a trailing one.
    *   Empty string will remain empty.

    Examples::

        >>> reform_path('1\\\\2')
        '1/2'
        >>> reform_path('////1////2')
        '/1/2'
        >>> reform_path('/1/2/3/../4/../..')
        '/1'
        >>> reform_path('/1/..')
        '/'
        >>> reform_path('../')
        Traceback (most recent call last):
          File "a.py", line 63, in <module>
            reform_path('../')
          ...
        ValueError: .. out of root

    :param path: Original path string.
    :type path: :class:`str`
    :return: The translated new path.

    :raises: :class:`ValueError` if ".." could not find any parent to consume.
    """

    # Normalise backslashes up front so a single split handles both styles.
    path = path.replace('\\', '/')
    # Remember the leading slash now; the split below discards it.
    lead_slash = path.startswith('/')
    parts = []

    for part in path.split('/'):
        # Empty components come from leading, trailing or repeated slashes;
        # "." refers to the current directory.  Neither contributes anything.
        if not part or part == '.':
            continue
        if part == '..':
            # ".." may only cancel a component that was actually given; it
            # is never allowed to climb above the start of the path.
            if not parts:
                raise ValueError('.. out of root')
            parts.pop()
        else:
            parts.append(part)

    joined = '/'.join(parts)
    return '/' + joined if lead_slash else joined
Navigation

Source code for railgun.common.url

Quick search