import boto3
import configparser
import os
import urllib3

import folium
import geopandas as gpd
import pandas as pd
import rasterio
from rasterio.plot import show
import numpy as np
from matplotlib import pyplot

import tempfile
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 1
----> 1 import boto3
      2 import configparser
      3 import os

ModuleNotFoundError: No module named 'boto3'
urllib3.disable_warnings()

Connection with S3 Bucket#

All GISCO Reference Grid datasets are available on S3 Bucket. Below configuration allows to list and download defined datasets from there.

def s3_connection(credentials: dict) -> boto3.session.Session:
    """Establishes a connection to an S3 bucket.

    Args:
        credentials (dict): A dictionary containing AWS S3 credentials with keys 
                            'host_base', 'access_key', and 'secret_key'.

    Returns:
        boto3.session.Session: A boto3 session client configured with the provided 
                               credentials for interacting with the S3 service.
    """
    s3 = boto3.client('s3',
                      endpoint_url=credentials['host_base'],
                      aws_access_key_id=credentials['access_key'],
                      aws_secret_access_key=credentials['secret_key'],
                      use_ssl=True,
                      verify=False)
    return s3

# Load s3 credentials
config = configparser.ConfigParser()
config.read('/home/eouser/.s3cfg')
credentials = dict(config['default'].items())

# Connection with S3 eodata
s3 = s3_connection(credentials)

Browsing S3 bucket content#

response = s3.list_objects_v2(Bucket='ESTAT', Prefix='GISCO_Reference_Grid')
if 'Contents' in response:
    print("Objects in bucket:")
    # Iterate over each object
    for obj in response['Contents']:
        print(obj['Key'])
else:
    print("No objects found in the bucket.")
Objects in bucket:
GISCO_Reference_Grid/grid_100km.csv
GISCO_Reference_Grid/grid_100km.parquet
GISCO_Reference_Grid/grid_100km_point.gpkg
GISCO_Reference_Grid/grid_100km_surf.gpkg
GISCO_Reference_Grid/grid_10km.csv
GISCO_Reference_Grid/grid_10km.parquet
GISCO_Reference_Grid/grid_10km_point.gpkg
GISCO_Reference_Grid/grid_10km_surf.gpkg
GISCO_Reference_Grid/grid_1km.csv
GISCO_Reference_Grid/grid_1km.parquet
GISCO_Reference_Grid/grid_1km_point.gpkg
GISCO_Reference_Grid/grid_1km_surf.gpkg
GISCO_Reference_Grid/grid_20km.csv
GISCO_Reference_Grid/grid_20km.parquet
GISCO_Reference_Grid/grid_20km_point.gpkg
GISCO_Reference_Grid/grid_20km_surf.gpkg
GISCO_Reference_Grid/grid_2km.csv
GISCO_Reference_Grid/grid_2km.parquet
GISCO_Reference_Grid/grid_2km_point.gpkg
GISCO_Reference_Grid/grid_2km_surf.gpkg
GISCO_Reference_Grid/grid_50km.csv
GISCO_Reference_Grid/grid_50km.parquet
GISCO_Reference_Grid/grid_50km_point.gpkg
GISCO_Reference_Grid/grid_50km_surf.gpkg
GISCO_Reference_Grid/grid_5km.csv
GISCO_Reference_Grid/grid_5km.parquet
GISCO_Reference_Grid/grid_5km_point.gpkg
GISCO_Reference_Grid/grid_5km_surf.gpkg

Reading vector file to GeoDataFrame#

%%time 

object_path = 'GISCO_Reference_Grid/grid_20km_surf.gpkg'

# Create a temporary directory to store GeoPackage file
with tempfile.TemporaryDirectory() as tmpdirname:
    # Define local path to save GeoPackage file
    local_geopackage_path = os.path.join(tmpdirname, object_path.split('/')[-1])

    # Download the GeoPackage from S3
    s3.download_file('ESTAT', object_path, local_geopackage_path)

    # Read the GeoPackage into a GeoDataFrame
    gdf = gpd.read_file(local_geopackage_path)
/opt/jupyterhub/lib/python3.10/site-packages/pyogrio/raw.py:198: RuntimeWarning: GPKG: unrecognized user_version=0x00000000 (0) on '/tmp/tmppsjdaaqr/grid_20km_surf.gpkg'
  return ogr_read(
CPU times: user 370 ms, sys: 39.2 ms, total: 409 ms
Wall time: 551 ms
# Geodata parameters
print(gdf.info())
print('----')
print(f'Coordinate system: {gdf.crs}')
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 19694 entries, 0 to 19693
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   DIST_BORD   19694 non-null  float64 
 1   TOT_P_2018  19694 non-null  float64 
 2   TOT_P_2006  19694 non-null  float64 
 3   GRD_ID      19694 non-null  object  
 4   TOT_P_2011  19694 non-null  float64 
 5   TOT_P_2021  19694 non-null  int32   
 6   Y_LLC       19694 non-null  int32   
 7   CNTR_ID     19694 non-null  object  
 8   NUTS2016_3  19694 non-null  object  
 9   NUTS2016_2  19694 non-null  object  
 10  NUTS2016_1  19694 non-null  object  
 11  NUTS2016_0  19694 non-null  object  
 12  LAND_PC     19694 non-null  float64 
 13  X_LLC       19694 non-null  int32   
 14  NUTS2021_3  19694 non-null  object  
 15  NUTS2021_2  19694 non-null  object  
 16  DIST_COAST  19694 non-null  float64 
 17  NUTS2021_1  19694 non-null  object  
 18  NUTS2021_0  19694 non-null  object  
 19  geometry    19694 non-null  geometry
dtypes: float64(6), geometry(1), int32(3), object(10)
memory usage: 2.8+ MB
None
----
Coordinate system: EPSG:3035
gdf.head()
DIST_BORD TOT_P_2018 TOT_P_2006 GRD_ID TOT_P_2011 TOT_P_2021 Y_LLC CNTR_ID NUTS2016_3 NUTS2016_2 NUTS2016_1 NUTS2016_0 LAND_PC X_LLC NUTS2021_3 NUTS2021_2 DIST_COAST NUTS2021_1 NUTS2021_0 geometry
0 3.981892e+06 0.0 0.0 CRS3035RES20000mN-2780000E8700000 0.0 0 -2780000 FR FRY50 FRY5 FRY FR 9.630000 8700000 FRY50 FRY5 4.548281e+06 FRY FR POLYGON ((8700000 -2780000, 8720000 -2780000, ...
1 5.029264e+06 0.0 0.0 CRS3035RES20000mN-3040000E9960000 0.0 0 -3040000 FR FRY40 FRY4 FRY FR 15.370000 9960000 FRY40 FRY4 5.457006e+06 FRY FR POLYGON ((9960000 -3040000, 9980000 -3040000, ...
2 4.076750e+05 1071.0 797.0 CRS3035RES20000mN1460000E5580000 964.0 718 1460000 EL EL434 EL43 EL4 EL 59.720001 5580000 EL434 EL43 1.463860e+03 EL4 EL POLYGON ((5580000 1460000, 5600000 1460000, 56...
3 1.966680e+05 3045.0 2127.0 CRS3035RES20000mN1780000E3020000 2636.0 3234 1780000 ES ES613 ES61 ES6 ES 96.940002 3020000 ES613 ES61 1.517438e+05 ES6 ES POLYGON ((3020000 1780000, 3040000 1780000, 30...
4 1.705567e+05 0.0 0.0 CRS3035RES20000mN2920000E5960000 0.0 0 2920000 UA 100.000000 5960000 4.385629e+04 POLYGON ((5960000 2920000, 5980000 2920000, 59...

GeoDataFrame explanation#

GeoDataFrame inherits most of pandas DataFrame methods. That allows to work with GeoDataFrame on the same way.

# Creating new GeoDataFrame with defined columns
gdf[['NUTS2021_3','DIST_COAST','NUTS2021_1','geometry']]
NUTS2021_3 DIST_COAST NUTS2021_1 geometry
0 FRY50 4.548281e+06 FRY POLYGON ((8700000 -2780000, 8720000 -2780000, ...
1 FRY40 5.457006e+06 FRY POLYGON ((9960000 -3040000, 9980000 -3040000, ...
2 EL434 1.463860e+03 EL4 POLYGON ((5580000 1460000, 5600000 1460000, 56...
3 ES613 1.517438e+05 ES6 POLYGON ((3020000 1780000, 3040000 1780000, 30...
4 4.385629e+04 POLYGON ((5960000 2920000, 5980000 2920000, 59...
... ... ... ... ...
19689 9.969987e+04 POLYGON ((5860000 2920000, 5880000 2920000, 58...
19690 8.995395e+04 POLYGON ((5880000 2920000, 5900000 2920000, 59...
19691 7.365964e+04 POLYGON ((5900000 2920000, 5920000 2920000, 59...
19692 5.880720e+04 POLYGON ((5920000 2920000, 5940000 2920000, 59...
19693 4.786262e+04 POLYGON ((5940000 2920000, 5960000 2920000, 59...

19694 rows × 4 columns

# Filtering records based on attribute value 
gdf[gdf['TOT_P_2018']>0].head()
DIST_BORD TOT_P_2018 TOT_P_2006 GRD_ID TOT_P_2011 TOT_P_2021 Y_LLC CNTR_ID NUTS2016_3 NUTS2016_2 NUTS2016_1 NUTS2016_0 LAND_PC X_LLC NUTS2021_3 NUTS2021_2 DIST_COAST NUTS2021_1 NUTS2021_0 geometry
2 407674.968750 1071.0 797.0 CRS3035RES20000mN1460000E5580000 964.0 718 1460000 EL EL434 EL43 EL4 EL 59.720001 5580000 EL434 EL43 1463.859985 EL4 EL POLYGON ((5580000 1460000, 5600000 1460000, 56...
3 196668.046875 3045.0 2127.0 CRS3035RES20000mN1780000E3020000 2636.0 3234 1780000 ES ES613 ES61 ES6 ES 96.940002 3020000 ES613 ES61 151743.843750 ES6 ES POLYGON ((3020000 1780000, 3040000 1780000, 30...
14 215925.015625 5175.0 5251.0 CRS3035RES20000mN1780000E3040000 5710.0 4952 1780000 ES ES613 ES61 ES6 ES 98.290001 3040000 ES613 ES61 152737.187500 ES6 ES POLYGON ((3040000 1780000, 3060000 1780000, 30...
20 573108.437500 774.0 783.0 CRS3035RES20000mN2940000E3240000 775.0 1117 2940000 FR FRH02 FRH0 FRH FR 0.660000 3240000 FRH02 FRH0 9194.360352 FRH FR POLYGON ((3240000 2940000, 3260000 2940000, 32...
21 553914.937500 2551.0 2678.0 CRS3035RES20000mN2940000E3260000 2694.0 2681 2940000 FR FRH02 FRH0 FRH FR 4.920000 3260000 FRH02 FRH0 7043.680176 FRH FR POLYGON ((3260000 2940000, 3280000 2940000, 32...

Displaying geometries on basemap#

To display vector geometry on map we recommend folium. Folium allows displaying different types of geometries like Polygons, Lines and Points.
IMPORTANT: Each geometry presenting on map must be transformed to EPSG:4326 coordinates system

# Filtering many polygons
gdf_filter = gdf[gdf.NUTS2021_0 == 'FR']
# Add the polygons to the map

m1 = folium.Map(location=[46.865, 2.508], zoom_start=6)

for _, r in gdf_filter.to_crs(4326).iterrows():
    sim_geo = gpd.GeoSeries(r["geometry"])
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j, style_function=lambda x: {"fillColor": "orange"})
    folium.Popup(r["GRD_ID"]).add_to(geo_j)
    geo_j.add_to(m1)

m1
Make this Notebook Trusted to load map: File -> Trust Notebook
%%time 

# S3 reference to GeoTIFF
object_path = 'GISCO_Reference_Grid/grid_10km.csv'

# Create a temporary directory to store the CSV file
with tempfile.TemporaryDirectory() as tmpdirname:
    # Define local path to save CSV file
    local_csv_path = os.path.join(tmpdirname, object_path.split('/')[-1])

    # Download CSV from S3
    s3.download_file('ESTAT', object_path, local_csv_path)

    # Read CSV into DataFrame
    df = pd.read_csv(local_csv_path)
CPU times: user 183 ms, sys: 32 ms, total: 215 ms
Wall time: 346 ms
# Printing 

df.head()
DIST_BORD TOT_P_2018 TOT_P_2006 GRD_ID TOT_P_2011 TOT_P_2021 Y_LLC CNTR_ID NUTS2016_3 NUTS2016_2 NUTS2016_1 NUTS2016_0 LAND_PC X_LLC NUTS2021_3 NUTS2021_2 DIST_COAST NUTS2021_1 NUTS2021_0
0 3.974822e+06 0 0 CRS3035RES10000mN-2770000E8700000 0 0 -2770000 FR NaN NaN NaN NaN 0.00 8700000 NaN NaN 4.541512e+06 NaN NaN
1 4.010088e+06 0 0 CRS3035RES10000mN-2780000E8740000 0 0 -2780000 FR FRY50 FRY5 FRY FR 9.36 8740000 FRY50 FRY5 4.569328e+06 FRY FR
2 5.016317e+05 3004 2733 CRS3035RES10000mN1020000E2000000 3544 3056 1020000 ES ES708 ES70 ES7 ES 100.00 2000000 ES708 ES70 1.424885e+05 ES7 ES
3 1.559614e+05 0 0 CRS3035RES10000mN1640000E3080000 0 0 1640000 ES ES614-ES617 ES61 ES6 ES 100.00 3080000 ES614-ES617 ES61 1.274594e+04 ES6 ES
4 4.450080e+05 0 0 CRS3035RES10000mN2040000E6180000 0 0 2040000 TR TR412-TR521 TR41-TR52 TR4-TR5 TR 100.00 6180000 TR412-TR521 TR41-TR52 2.279641e+05 TR4-TR5 TR

Reading Parquet file to GeoDataFrame#

%%time

object_path = 'GISCO_Reference_Grid/grid_10km.parquet'

# Create a temporary directory to store parquet file
with tempfile.TemporaryDirectory() as tmpdirname:
    # Define local path to save parquet
    local_parquet_path = os.path.join(tmpdirname, object_path.split('/')[-1])

    # Download the parquet file from S3
    s3.download_file('ESTAT', object_path, local_parquet_path)

    # Read the parquet into a GeoDataFrame
    df = pd.read_parquet(local_parquet_path)
CPU times: user 121 ms, sys: 23.9 ms, total: 145 ms
Wall time: 163 ms
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74861 entries, 0 to 74860
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   DIST_BORD   74861 non-null  float64
 1   TOT_P_2018  74861 non-null  int32  
 2   TOT_P_2006  74861 non-null  int32  
 3   GRD_ID      74861 non-null  object 
 4   TOT_P_2011  74861 non-null  int32  
 5   TOT_P_2021  74861 non-null  int32  
 6   Y_LLC       74861 non-null  int32  
 7   CNTR_ID     74861 non-null  object 
 8   NUTS2016_3  64401 non-null  object 
 9   NUTS2016_2  64401 non-null  object 
 10  NUTS2016_1  64401 non-null  object 
 11  NUTS2016_0  64401 non-null  object 
 12  LAND_PC     74861 non-null  float64
 13  X_LLC       74861 non-null  int32  
 14  NUTS2021_3  64402 non-null  object 
 15  NUTS2021_2  64402 non-null  object 
 16  DIST_COAST  74861 non-null  float64
 17  NUTS2021_1  64402 non-null  object 
 18  NUTS2021_0  64402 non-null  object 
dtypes: float64(3), int32(6), object(10)
memory usage: 9.1+ MB
DIST_BORD TOT_P_2018 TOT_P_2006 GRD_ID TOT_P_2011 TOT_P_2021 Y_LLC CNTR_ID NUTS2016_3 NUTS2016_2 NUTS2016_1 NUTS2016_0 LAND_PC X_LLC NUTS2021_3 NUTS2021_2 DIST_COAST NUTS2021_1 NUTS2021_0
0 3.974822e+06 0 0 CRS3035RES10000mN-2770000E8700000 0 0 -2770000 FR None None None None 0.00 8700000 None None 4.541512e+06 None None
1 4.010088e+06 0 0 CRS3035RES10000mN-2780000E8740000 0 0 -2780000 FR FRY50 FRY5 FRY FR 9.36 8740000 FRY50 FRY5 4.569328e+06 FRY FR
2 5.016317e+05 3004 2733 CRS3035RES10000mN1020000E2000000 3544 3056 1020000 ES ES708 ES70 ES7 ES 100.00 2000000 ES708 ES70 1.424885e+05 ES7 ES
3 1.559614e+05 0 0 CRS3035RES10000mN1640000E3080000 0 0 1640000 ES ES614-ES617 ES61 ES6 ES 100.00 3080000 ES614-ES617 ES61 1.274594e+04 ES6 ES
4 4.450080e+05 0 0 CRS3035RES10000mN2040000E6180000 0 0 2040000 TR TR412-TR521 TR41-TR52 TR4-TR5 TR 100.00 6180000 TR412-TR521 TR41-TR52 2.279641e+05 TR4-TR5 TR

Generating geometries of the grid from coordinates in parquet file#

from shapely.geometry import Point, box

def create_geometry(df, resolution_km):
    cell_size = resolution_km * 1000
    df['geometry'] = df.apply(lambda row: box(row['X_LLC'], row['Y_LLC'], 
                                              row['X_LLC'] + cell_size, row['Y_LLC'] + cell_size), axis=1)
    return gpd.GeoDataFrame(df, geometry='geometry')

lux_df = df[df['CNTR_ID'].str.contains('LU', na=False)].copy()

resolution_km = 10  # Set this according to your data resolution
lux_gdf = create_geometry(lux_df, resolution_km)

# Set the coordinate reference system
lux_gdf.set_crs(epsg=3035, inplace=True) 

lux_gdf.head()
DIST_BORD TOT_P_2018 TOT_P_2006 GRD_ID TOT_P_2011 TOT_P_2021 Y_LLC CNTR_ID NUTS2016_3 NUTS2016_2 NUTS2016_1 NUTS2016_0 LAND_PC X_LLC NUTS2021_3 NUTS2021_2 DIST_COAST NUTS2021_1 NUTS2021_0 geometry
30836 8456.019531 8457 7793 CRS3035RES10000mN2930000E4010000 8115 8550 2930000 LU-FR FRF31 FRF3 FRF FR 100.000000 4010000 FRF31 FRF3 238175.593750 FRF FR POLYGON ((4020000 2930000, 4020000 2940000, 40...
30837 4399.140137 41076 34991 CRS3035RES10000mN2930000E4020000 40560 48170 2930000 LU-FR LU000-FRF31-FRF33 LU00-FRF3 LU0-FRF LU-FR 100.000000 4020000 LU000-FRF31-FRF33 LU00-FRF3 242345.468750 LU0-FRF LU-FR POLYGON ((4030000 2930000, 4030000 2940000, 40...
30838 296.720001 52368 43368 CRS3035RES10000mN2930000E4030000 45902 54385 2930000 LU-FR LU000-FRF33 LU00-FRF3 LU0-FRF LU-FR 100.000000 4030000 LU000-FRF33 LU00-FRF3 246850.234375 LU0-FRF LU-FR POLYGON ((4040000 2930000, 4040000 2940000, 40...
30839 3944.389893 8360 6416 CRS3035RES10000mN2930000E4040000 7028 8775 2930000 LU-FR LU000-FRF33 LU00-FRF3 LU0-FRF LU-FR 99.260002 4040000 LU000-FRF33 LU00-FRF3 251671.906250 LU0-FRF LU-FR POLYGON ((4050000 2930000, 4050000 2940000, 40...
30840 226.130005 20014 17188 CRS3035RES10000mN2930000E4050000 18939 21845 2930000 LU-DE-FR DEC02-LU000-FRF33 DEC0-LU00-FRF3 LU0-DEC-FRF LU-DE-FR 98.000000 4050000 DEC02-LU000-FRF33 LU00-DEC0-FRF3 256792.656250 DEC-LU0-FRF LU-DE-FR POLYGON ((4060000 2930000, 4060000 2940000, 40...
m2 = folium.Map(location=[49.85, 6.15], zoom_start=10)

for _, r in lux_gdf.to_crs(4326).iterrows():
    sim_geo = gpd.GeoSeries(r["geometry"])
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j, style_function=lambda x: {"fillColor": "orange"})
    folium.Popup(r["GRD_ID"]).add_to(geo_j)
    geo_j.add_to(m2)

m2
Make this Notebook Trusted to load map: File -> Trust Notebook