# Source code for sar_pre_processing.file_list_sar_pre_processing

"""
Create List of SAR data which will be processed by sar_pre_processer module
"""

import os
import yaml
import fnmatch
import pyproj
import zipfile
import shutil
# import ogr
import xml.etree.ElementTree as etree
from datetime import datetime

import pdb


class AttributeDict(object):
    """Convert a nested dict into an object with attribute access.

    Keys become attributes (``AttributeDict.attribute``) instead of
    ``Dict["key"]``.  Nested dicts are converted recursively, so chained
    access like ``AttributeDict.attr.attr`` works.
    """

    def __init__(self, **entries):
        self.add_entries(**entries)

    def add_entries(self, **entries):
        """Recursively store *entries*, wrapping nested dicts as AttributeDict."""
        for key, value in entries.items():
            # isinstance (not `type(value) is dict`) also handles dict subclasses
            if isinstance(value, dict):
                self.__dict__[key] = AttributeDict(**value)
            else:
                self.__dict__[key] = value

    def __getitem__(self, key):
        """Provide dict-style access to attributes."""
        return getattr(self, key)
class SARList(object):
    """Build the list of Sentinel-1 zip files to be pre-processed.

    The configuration (YAML file path given as ``config=...``) provides at
    least ``input_folder``; optionally ``year`` and a ``region`` bounding box.
    """

    def __init__(self, **kwargs):
        self.config = kwargs.get('config', None)
        self._load_config()
        self._check()

    def _check(self):
        assert self.config is not None, 'ERROR: Configuration file needs to be provided'
        assert self.config.input_folder is not None, 'ERROR: Input folder needs to be provided'

    def _load_config(self):
        """Load configuration and write it to self.config."""
        with open(self.config, 'r') as cfg:
            # safe_load: yaml.load without an explicit Loader is deprecated and
            # unsafe on untrusted input
            self.config = yaml.safe_load(cfg)
        self.config = AttributeDict(**self.config)

    def _create_filelist(self, input_folder, expression):
        """Return all files directly in *input_folder* (no subfolders) whose
        name matches the fnmatch *expression*."""
        filelist = []
        for root, dirnames, filenames in os.walk(input_folder):
            for filename in fnmatch.filter(filenames, expression):
                filelist.append(os.path.join(root, filename))
            # break after the first os.walk level: subfolders are ignored
            break
        return filelist

    def _decomposition_filename(self, file):
        """Decompose a path into (filepath, filename, fileshortname, extension)."""
        (filepath, filename) = os.path.split(file)
        (fileshortname, extension) = os.path.splitext(filename)
        return filepath, filename, fileshortname, extension

    def _select_year(self, filelist, year):
        """Select all S1 files in *filelist* acquired in *year*.

        NOTE: position of the year in the filename is hard coded
        (characters 17:21 of the standard S1 product name).
        """
        filelist_new = []
        for file in filelist:
            filepath, filename, fileshortname, extension = self._decomposition_filename(file)
            # bug fix: compare against the `year` parameter, not self.config.year
            # (callers pass self.config.year, so behavior for them is unchanged)
            if filename[17:21] == str(year):
                filelist_new.append(file)
        print('Number of found files for year %s:' % year, len(filelist_new))
        return filelist_new

    def _check_location(self, file, location, output_folder):
        """Check whether the area of interest *location* intersects *file*.

        The footprint is read from the map-overlay.kml inside the zipped
        S1 product; the intersection test uses OGR polygon geometry.
        Returns True/False.
        """
        # ogr's top-of-file import is commented out in this module; import it
        # here so the method fails with a clear ImportError instead of a
        # NameError when GDAL is missing.
        try:
            from osgeo import ogr
        except ImportError:
            import ogr
        filepath, filename, fileshortname, extension = self._decomposition_filename(file)
        # Path to the kml file within the zipped S1 image, and once extracted
        xml_file = fileshortname + '.SAFE/preview/map-overlay.kml'
        xml_file_extracted = os.path.join(output_folder, xml_file)
        # Extract the footprint kml from the zip archive
        try:
            with zipfile.ZipFile(file, 'r') as zfile:
                zfile.extract(xml_file, output_folder)
        except (zipfile.BadZipFile, KeyError, OSError):
            print('zipfile cannot open')
            contained = False
            return contained
        # Parse the kml file and read the image bounding box
        tree = etree.parse(xml_file_extracted)
        root = tree.getroot()
        for tiepoint in root.iter('{http://www.google.com/kml/ext/2.2}LatLonQuad'):
            # list(elem) replaces Element.getchildren(), removed in Python 3.9
            child_list = list(tiepoint)
            bounding_box = child_list[0].text
        bounding_box_list = bounding_box.split(' ')
        # WKT requires that last point = first point in polygon, add first point
        wkt_image1 = 'POLYGON((' + bounding_box + ' ' + bounding_box_list[0] + '))'
        # WKT requires other use of comma and spaces in coordinate list
        wkt_image2 = wkt_image1.replace(' ', ';')
        wkt_image3 = wkt_image2.replace(',', ' ')
        wkt_image = wkt_image3.replace(';', ',')
        # Define projections (both WGS84 — transform is effectively identity,
        # kept for parity with the original implementation)
        datasetEPSG = pyproj.Proj('+init=EPSG:4326')
        locationEPSG = pyproj.Proj('+init=EPSG:4326')
        # Transform coordinates of location into file coordinates
        upper_left_x, upper_left_y = pyproj.transform(
            locationEPSG, datasetEPSG, location[0], location[1])
        lower_right_x, lower_right_y = pyproj.transform(
            locationEPSG, datasetEPSG, location[2], location[3])
        wkt_location = 'POLYGON((' + str(upper_left_x) + ' ' + str(upper_left_y) + ',' + \
            str(upper_left_x) + ' ' + str(lower_right_y) + ',' + \
            str(lower_right_x) + ' ' + str(lower_right_y) + ',' + \
            str(lower_right_x) + ' ' + str(upper_left_y) + ',' + \
            str(upper_left_x) + ' ' + str(upper_left_y) + '))'
        # Use ogr to check if the polygons intersect
        poly_location = ogr.CreateGeometryFromWkt(wkt_location)
        poly_image = ogr.CreateGeometryFromWkt(wkt_image)
        contained = poly_location.Intersect(poly_image)
        # Clean up the extracted .SAFE folder
        shutil.rmtree(os.path.join(output_folder, fileshortname + '.SAFE'))
        return contained

    def _contain_area_of_interest(self, filelist, location, output_folder):
        """Keep only the files whose footprint intersects the area of interest."""
        filelist_new = []
        for file in filelist:
            contained = self._check_location(file, location, output_folder)
            if contained is False:
                continue
            filelist_new.append(file)
        print('Number of found files containing area of interest: %s' % (len(filelist_new)))
        return filelist_new

    def _double_processed(self, filelist):
        """Check if two file names have the exact same time stamp (double
        processed data by ESA) and choose the newest one.

        input:  file list possibly containing double processed data
        output: file list without double processed data
        """
        filelist.sort()
        filelist_new = []
        filelist_double_processed = []
        # enumerate instead of filelist.index(file): O(n) instead of O(n^2)
        for index, file in enumerate(filelist):
            filepath, filename, fileshortname, extension = self._decomposition_filename(file)
            try:
                filepath1, filename1, fileshortname1, extension1 = \
                    self._decomposition_filename(filelist[index + 1])
            except IndexError:
                filename1 = ''
            if index == 0:
                filename2 = ''
            else:
                filepath2, filename2, fileshortname2, extension2 = \
                    self._decomposition_filename(filelist[index - 1])
            # characters 0:62 cover everything up to the processing time stamp
            if filename[0:62] == filename1[0:62] or filename[0:62] == filename2[0:62]:
                filelist_double_processed.append(file)
            else:
                filelist_new.append(file)
        print('Number of found files that were double processed: %s' %
              (len(filelist_double_processed) / 2.))
        filelist_end = self._check_timestamp(filelist_double_processed)
        filelist_end = filelist_end + filelist_new
        filelist_end.sort()
        return filelist_end

    def _check_processing_timestamp(self, file, file1):
        """Compare the processing time stamps of two products with identical
        acquisition names and return the file with the newer time stamp.

        Returns None when the two names do not match, False when a zip
        cannot be opened.
        """
        filepath, filename, fileshortname, extension = self._decomposition_filename(file)
        filepath1, filename1, fileshortname1, extension1 = self._decomposition_filename(file1)
        # Only compare products with the same acquisition identifier
        if fileshortname[0:62] != fileshortname1[0:62]:
            return
        # Path to manifest.safe within each zipped Sentinel image
        xml_file = fileshortname + '.SAFE/manifest.safe'
        xml_file1 = fileshortname1 + '.SAFE/manifest.safe'
        # Extract both manifests
        try:
            with zipfile.ZipFile(file, 'r') as zfile:
                zfile.extract(xml_file, filepath)
            with zipfile.ZipFile(file1, 'r') as zfile:
                zfile.extract(xml_file1, filepath)
        except (zipfile.BadZipFile, KeyError, OSError):
            print('zipfile cannot open !!!!')
            contained = False
            return contained
        xml_file_extracted = os.path.join(filepath, xml_file)
        xml_file_extracted1 = os.path.join(filepath, xml_file1)
        # Parse the manifests and read the <processing> time stamp attribute
        tree = etree.parse(xml_file_extracted)
        root = tree.getroot()
        processing_timestamp = root.find(
            './/{http://www.esa.int/safe/sentinel-1.0}processing')
        timestamp = processing_timestamp.items()[0][1]
        tree1 = etree.parse(xml_file_extracted1)
        root1 = tree1.getroot()
        processing_timestamp1 = root1.find(
            './/{http://www.esa.int/safe/sentinel-1.0}processing')
        timestamp1 = processing_timestamp1.items()[0][1]
        # Clean up the extracted .SAFE folders
        shutil.rmtree(os.path.join(filepath, fileshortname + '.SAFE'))
        shutil.rmtree(os.path.join(filepath, fileshortname1 + '.SAFE'))
        # ISO time stamps compare correctly as strings
        if timestamp > timestamp1:
            return file
        else:
            return file1

    def _check_timestamp(self, filelist):
        """From pairs of double processed files keep only the newest one."""
        filelist_new = []
        for index, file in enumerate(filelist):
            try:
                file1 = filelist[index + 1]
            except IndexError:
                continue
            file_timestamp = self._check_processing_timestamp(file, file1)
            if file_timestamp is not None:
                filelist_new.append(file_timestamp)
        return filelist_new

    def _border_control(self, filelist):
        """Separate files whose names share the first 25 characters
        (neighbouring slices of the same acquisition, i.e. potential border
        issues) from the rest.

        Returns (filelist_without_border_issues, filelist_border_control).
        """
        filelist.sort()
        filelist_new = []
        filelist_border_control = []
        for index, file in enumerate(filelist):
            filepath, filename, fileshortname, extension = self._decomposition_filename(file)
            try:
                filepath1, filename1, fileshortname1, extension1 = \
                    self._decomposition_filename(filelist[index + 1])
            except IndexError:
                filename1 = ''
            if index == 0:
                filename2 = ''
            else:
                filepath2, filename2, fileshortname2, extension2 = \
                    self._decomposition_filename(filelist[index - 1])
            if filename[0:25] == filename1[0:25] or filename[0:25] == filename2[0:25]:
                filelist_border_control.append(file)
            else:
                filelist_new.append(file)
        print('Number of found files with border issues: %s' % (len(filelist_border_control)))
        return filelist_new, filelist_border_control

    def create_list(self, **kwargs):
        """Create the final processing list.

        Returns a (filelist, filelist_border_control) tuple as produced by
        _border_control.
        """
        # list with all zip files found in input_folder
        filelist = self._create_filelist(self.config.input_folder, '*.zip')
        # If year is specified in the config file, pre-processing is only
        # done for that year (previously this block was duplicated; the
        # filter is idempotent, so one pass is sufficient)
        try:
            filelist = self._select_year(filelist, self.config.year)
            filelist.sort()
        except AttributeError:
            print('year not specified')
        # list with all zip files that contain the area of interest
        try:
            lower_right_y = self.config.region['lr']['lat']
            upper_left_y = self.config.region['ul']['lat']
            upper_left_x = self.config.region['ul']['lon']
            lower_right_x = self.config.region['lr']['lon']
            # todo: how is it with coordinates that go across the datum line ??
            location = [upper_left_x, upper_left_y, lower_right_x, lower_right_y]
            filelist = self._contain_area_of_interest(
                filelist, location, self.config.input_folder)
        except AttributeError:
            print('area of interest not specified')
        # check for double processed data by ESA and choose the newest one
        filelist = self._double_processed(filelist)
        filelist = self._border_control(filelist)
        return filelist