# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. """Utilities for standard operations on URIs of different kinds.""" from __future__ import print_function import re import sys import urllib import urllib2 from chromite.lib.paygen import filelib from chromite.lib.paygen import gslib # This module allows files from different storage types to be handled # in a common way, for supported operations. PROTOCOL_GS = gslib.PROTOCOL PROTOCOL_HTTP = 'http' PROTOCOL_HTTPS = 'https' PROTOCOLS = (PROTOCOL_GS, PROTOCOL_HTTP, PROTOCOL_HTTPS) PROTOCOL_SEP = '://' EXTRACT_PROTOCOL_RE = re.compile(r'^(\w+)%s' % PROTOCOL_SEP) SPLIT_URI_RE = re.compile(r'^(\w+)%s(.*)$' % PROTOCOL_SEP) TYPE_GS = PROTOCOL_GS TYPE_HTTP = PROTOCOL_HTTP TYPE_HTTPS = PROTOCOL_HTTPS TYPE_LOCAL = 'file' class NotSupportedForType(RuntimeError): """Raised when operation is not supported for a particular file type""" def __init__(self, uri_type, extra_msg=None): # pylint: disable=protected-access function = sys._getframe(1).f_code.co_name msg = 'Function %s not supported for %s URIs' % (function, uri_type) if extra_msg: msg += ', ' + extra_msg RuntimeError.__init__(self, msg) class NotSupportedForTypes(RuntimeError): """Raised when operation is not supported for all particular file type""" def __init__(self, extra_msg=None, *uri_types): # pylint: disable=protected-access function = sys._getframe(1).f_code.co_name msg = ('Function %s not supported for set of URIs with types: %s' % (function, ', '.join(uri_types))) if extra_msg: msg += ', ' + extra_msg RuntimeError.__init__(self, msg) class NotSupportedBetweenTypes(RuntimeError): """Raised when operation is not supported between particular file types""" def __init__(self, uri_type1, uri_type2, extra_msg=None): # pylint: disable=protected-access function = sys._getframe(1).f_code.co_name msg = ('Function %s not supported between %s and %s URIs' % (function, uri_type1, uri_type2)) if extra_msg: msg += ', ' + extra_msg RuntimeError.__init__(self, msg) class MissingURLError(RuntimeError): """Raised when nothing exists at URL.""" def ExtractProtocol(uri): """Take a URI and return the protocol it is using, if any. Examples: 'gs://some/path' ==> 'gs' 'file:///some/path' ==> 'file' '/some/path' ==> None '/cns/some/colossus/path' ==> None Args: uri: The URI to get protocol from. Returns: Protocol string that is found, or None. """ match = EXTRACT_PROTOCOL_RE.search(uri) if match: return match.group(1) return None def GetUriType(uri): """Get the type of a URI. See the TYPE_* constants for examples. This is mostly based on URI protocols, with Colossus and local files as exceptions. Args: uri: The URI to consider Returns: The URI type. """ protocol = ExtractProtocol(uri) if protocol: return protocol return TYPE_LOCAL def SplitURI(uri): """Get the protocol and path from a URI Examples: 'gs://some/path' ==> ('gs', 'some/path') 'file:///some/path' ==> ('file', '/some/path') '/some/path' ==> (None, '/some/path') '/cns/some/colossus/path' ==> (None, '/cns/some/colossus/path') Args: uri: The uri to get protocol and path from. Returns; Tuple (protocol, path) """ match = SPLIT_URI_RE.search(uri) if match: return (match.group(1), match.group(2)) return (None, uri) def IsGsURI(uri): """Returns True if given uri uses Google Storage protocol.""" return PROTOCOL_GS == ExtractProtocol(uri) def IsFileURI(uri): """Return True if given uri is a file URI (or path). If uri uses the file protocol or it is a plain non-Colossus path then return True. Args: uri: Any URI or path. Returns: True or False as described above. """ return TYPE_LOCAL == GetUriType(uri) def IsHttpURI(uri, https_ok=False): """Returns True if given uri uses http, or optionally https, protocol. Args: uri: The URI to check. https_ok: If True, then accept https protocol as well. Returns: Boolean """ uri_type = GetUriType(uri) return TYPE_HTTP == uri_type or (https_ok and TYPE_HTTPS == uri_type) def IsHttpsURI(uri): """Returns True if given uri uses https protocol.""" return TYPE_HTTPS == GetUriType(uri) def MD5Sum(uri): """Compute or retrieve MD5 sum of uri. Supported for: local files, GS files. Args: uri: The /unix/path or gs:// uri to compute the md5sum on. Returns: A string representing the md5sum of the file/uri passed in. None if we do not understand the uri passed in or cannot compute the md5sum. """ uri_type = GetUriType(uri) if uri_type == TYPE_LOCAL: return filelib.MD5Sum(uri) elif uri_type == TYPE_GS: try: return gslib.MD5Sum(uri) except gslib.GSLibError: return None # Colossus does not have a command for getting MD5 sum. We could # copy the file to local disk and calculate it, but it seems better # to explicitly say it is not supported. raise NotSupportedForType(uri_type) def Cmp(uri1, uri2): """Return True if paths hold identical files. If either file is missing then always return False. Args: uri1: URI to a file. uri2: URI to a file. Returns: True if files are the same, False otherwise. Raises: NotSupportedBetweenTypes if Cmp cannot be done between the two URIs provided. """ uri_type1 = GetUriType(uri1) uri_type2 = GetUriType(uri2) uri_types = set([uri_type1, uri_type2]) if TYPE_GS in uri_types: # GS only supported between other GS files or local files. if len(uri_types) == 1 or TYPE_LOCAL in uri_types: return gslib.Cmp(uri1, uri2) if TYPE_LOCAL in uri_types and len(uri_types) == 1: return filelib.Cmp(uri1, uri2) raise NotSupportedBetweenTypes(uri_type1, uri_type2) class URLopener(urllib.FancyURLopener): """URLopener that will actually complain when download fails.""" # The urllib.urlretrieve function, which seems like a good fit for this, # does not give access to error code. def http_error_default(self, *args, **kwargs): urllib.URLopener.http_error_default(self, *args, **kwargs) def URLRetrieve(src_url, dest_path): """Download file from given URL to given local file path. Args: src_url: URL to download from. dest_path: Path to download to. Raises: MissingURLError if URL cannot be downloaded. """ opener = URLopener() try: opener.retrieve(src_url, dest_path) except IOError as e: # If the domain is valid but download failed errno shows up as None. if e.errno is None: raise MissingURLError('Unable to download %s' % src_url) # If the domain is invalid the errno shows up as 'socket error', weirdly. try: int(e.errno) # This means there was some normal error writing to the dest_path. raise except ValueError: raise MissingURLError('Unable to download %s (bad domain?)' % src_url) def Copy(src_uri, dest_uri): """Copy one uri to another. Args: src_uri: URI to copy from. dest_uri: Path to copy to. Raises: NotSupportedBetweenTypes if Cmp cannot be done between the two URIs provided. """ uri_type1 = GetUriType(src_uri) uri_type2 = GetUriType(dest_uri) uri_types = set([uri_type1, uri_type2]) if TYPE_GS in uri_types: # GS only supported between other GS files or local files. if len(uri_types) == 1 or TYPE_LOCAL in uri_types: return gslib.Copy(src_uri, dest_uri) if TYPE_LOCAL in uri_types and len(uri_types) == 1: return filelib.Copy(src_uri, dest_uri) if uri_type1 in (TYPE_HTTP, TYPE_HTTPS) and uri_type2 == TYPE_LOCAL: # Download file from URL. return URLRetrieve(src_uri, dest_uri) raise NotSupportedBetweenTypes(uri_type1, uri_type2) def Remove(*args, **kwargs): """Delete the file(s) at uris, or directory(s) with recurse set. Args: args: One or more URIs. ignore_no_match: If True, then do not complain if anything was not removed because no URI match was found. Like rm -f. Defaults to False. recurse: Remove recursively starting at path. Same as rm -R. Defaults to False. """ uri_types = set([GetUriType(u) for u in args]) if TYPE_GS in uri_types: # GS support only allows local files among list. if len(uri_types) == 1 or (TYPE_LOCAL in uri_types and len(uri_types) == 2): return gslib.Remove(*args, **kwargs) if TYPE_LOCAL in uri_types and len(uri_types) == 1: return filelib.Remove(*args, **kwargs) raise NotSupportedForTypes(*list(uri_types)) def Size(uri): """Return size of file at URI in bytes. Args: uri: URI to consider Returns: Size of file at given URI in bytes. Raises: MissingURLError if uri is a URL and cannot be found. """ uri_type = GetUriType(uri) if TYPE_GS == uri_type: return gslib.FileSize(uri) if TYPE_LOCAL == uri_type: return filelib.Size(uri) if TYPE_HTTP == uri_type or TYPE_HTTPS == uri_type: try: response = urllib2.urlopen(uri) if response.getcode() == 200: return int(response.headers.getheader('Content-Length')) except urllib2.HTTPError as e: # Interpret 4** errors as our own MissingURLError. if e.code < 400 or e.code >= 500: raise raise MissingURLError('No such file at URL %s' % uri) raise NotSupportedForType(uri_type) def Exists(uri, as_dir=False): """Return True if file exists at given URI. If URI is a directory and as_dir is False then this will return False. Args: uri: URI to consider as_dir: If True then check URI as a directory, otherwise check as a file. Returns: True if file (or directory) exists at URI, False otherwise. """ uri_type = GetUriType(uri) if TYPE_GS == uri_type: if as_dir: # GS does not contain directories. return False return gslib.Exists(uri) if TYPE_LOCAL == uri_type: return filelib.Exists(uri, as_dir=as_dir) if TYPE_HTTP == uri_type or TYPE_HTTPS == uri_type: if as_dir: raise NotSupportedForType(uri_type, extra_msg='with as_dir=True') try: response = urllib2.urlopen(uri) return response.getcode() == 200 except urllib2.HTTPError: return False raise NotSupportedForType(uri_type) def ListFiles(root_path, recurse=False, filepattern=None, sort=False): """Return list of file paths under given root path. Directories are intentionally excluded from results. The root_path argument can be a local directory path, a Google storage directory URI, or a Colossus (/cns) directory path. Args: root_path: A local path, CNS path, or GS path to directory. recurse: Look for files in subdirectories, as well filepattern: glob pattern to match against basename of file sort: If True then do a default sort on paths Returns: List of paths to files that matched """ uri_type = GetUriType(root_path) if TYPE_GS == uri_type: return gslib.ListFiles(root_path, recurse=recurse, filepattern=filepattern, sort=sort) if TYPE_LOCAL == uri_type: return filelib.ListFiles(root_path, recurse=recurse, filepattern=filepattern, sort=sort) raise NotSupportedForType(uri_type) def CopyFiles(src_dir, dst_dir): """Recursively copy all files from src_dir into dst_dir This leverages the Copy method, so the restrictions there for what copies are supported apply here. Args: src_dir: A local, CNS, or GS directory to copy from. dst_dir: A local, CNS, or GS directory to copy into. Returns: A list of absolute path files for all copied files. """ dst_paths = [] src_paths = ListFiles(src_dir, recurse=True) for src_path in src_paths: dst_path = src_path.replace(src_dir, dst_dir) Copy(src_path, dst_path) dst_paths.append(dst_path) return dst_paths def RemoveDirContents(base_dir): """Remove all contents of a directory. Args: base_dir: directory to delete contents of. """ uri_type = GetUriType(base_dir) if TYPE_GS == uri_type: return gslib.RemoveDirContents(base_dir) if TYPE_LOCAL == uri_type: return filelib.RemoveDirContents(base_dir) raise NotSupportedForType(uri_type)