import boto3
import configparser
import os
import urllib3
import folium
import geopandas as gpd
import pandas as pd
import rasterio
from rasterio.plot import show
import numpy as np
from matplotlib import pyplot
import tempfile
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 1
----> 1 import boto3
2 import configparser
3 import os
ModuleNotFoundError: No module named 'boto3'
urllib3.disable_warnings()
Connection with S3 Bucket#
All GISCO Reference Grid datasets are available on S3 Bucket. Below configuration allows to list and download defined datasets from there.
def s3_connection(credentials: dict) -> boto3.session.Session:
"""Establishes a connection to an S3 bucket.
Args:
credentials (dict): A dictionary containing AWS S3 credentials with keys
'host_base', 'access_key', and 'secret_key'.
Returns:
boto3.session.Session: A boto3 session client configured with the provided
credentials for interacting with the S3 service.
"""
s3 = boto3.client('s3',
endpoint_url=credentials['host_base'],
aws_access_key_id=credentials['access_key'],
aws_secret_access_key=credentials['secret_key'],
use_ssl=True,
verify=False)
return s3
# Load s3 credentials
config = configparser.ConfigParser()
config.read('/home/eouser/.s3cfg')
credentials = dict(config['default'].items())
# Connection with S3 eodata
s3 = s3_connection(credentials)
Browsing S3 bucket content#
response = s3.list_objects_v2(Bucket='ESTAT', Prefix='GISCO_Reference_Grid')
if 'Contents' in response:
print("Objects in bucket:")
# Iterate over each object
for obj in response['Contents']:
print(obj['Key'])
else:
print("No objects found in the bucket.")
Objects in bucket:
GISCO_Reference_Grid/grid_100km.csv
GISCO_Reference_Grid/grid_100km.parquet
GISCO_Reference_Grid/grid_100km_point.gpkg
GISCO_Reference_Grid/grid_100km_surf.gpkg
GISCO_Reference_Grid/grid_10km.csv
GISCO_Reference_Grid/grid_10km.parquet
GISCO_Reference_Grid/grid_10km_point.gpkg
GISCO_Reference_Grid/grid_10km_surf.gpkg
GISCO_Reference_Grid/grid_1km.csv
GISCO_Reference_Grid/grid_1km.parquet
GISCO_Reference_Grid/grid_1km_point.gpkg
GISCO_Reference_Grid/grid_1km_surf.gpkg
GISCO_Reference_Grid/grid_20km.csv
GISCO_Reference_Grid/grid_20km.parquet
GISCO_Reference_Grid/grid_20km_point.gpkg
GISCO_Reference_Grid/grid_20km_surf.gpkg
GISCO_Reference_Grid/grid_2km.csv
GISCO_Reference_Grid/grid_2km.parquet
GISCO_Reference_Grid/grid_2km_point.gpkg
GISCO_Reference_Grid/grid_2km_surf.gpkg
GISCO_Reference_Grid/grid_50km.csv
GISCO_Reference_Grid/grid_50km.parquet
GISCO_Reference_Grid/grid_50km_point.gpkg
GISCO_Reference_Grid/grid_50km_surf.gpkg
GISCO_Reference_Grid/grid_5km.csv
GISCO_Reference_Grid/grid_5km.parquet
GISCO_Reference_Grid/grid_5km_point.gpkg
GISCO_Reference_Grid/grid_5km_surf.gpkg
Reading vector file to GeoDataFrame#
%%time
object_path = 'GISCO_Reference_Grid/grid_20km_surf.gpkg'
# Create a temporary directory to store GeoPackage file
with tempfile.TemporaryDirectory() as tmpdirname:
# Define local path to save GeoPackage file
local_geopackage_path = os.path.join(tmpdirname, object_path.split('/')[-1])
# Download the GeoPackage from S3
s3.download_file('ESTAT', object_path, local_geopackage_path)
# Read the GeoPackage into a GeoDataFrame
gdf = gpd.read_file(local_geopackage_path)
/opt/jupyterhub/lib/python3.10/site-packages/pyogrio/raw.py:198: RuntimeWarning: GPKG: unrecognized user_version=0x00000000 (0) on '/tmp/tmppsjdaaqr/grid_20km_surf.gpkg'
return ogr_read(
CPU times: user 370 ms, sys: 39.2 ms, total: 409 ms
Wall time: 551 ms
# Geodata parameters
print(gdf.info())
print('----')
print(f'Coordinate system: {gdf.crs}')
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 19694 entries, 0 to 19693
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 DIST_BORD 19694 non-null float64
1 TOT_P_2018 19694 non-null float64
2 TOT_P_2006 19694 non-null float64
3 GRD_ID 19694 non-null object
4 TOT_P_2011 19694 non-null float64
5 TOT_P_2021 19694 non-null int32
6 Y_LLC 19694 non-null int32
7 CNTR_ID 19694 non-null object
8 NUTS2016_3 19694 non-null object
9 NUTS2016_2 19694 non-null object
10 NUTS2016_1 19694 non-null object
11 NUTS2016_0 19694 non-null object
12 LAND_PC 19694 non-null float64
13 X_LLC 19694 non-null int32
14 NUTS2021_3 19694 non-null object
15 NUTS2021_2 19694 non-null object
16 DIST_COAST 19694 non-null float64
17 NUTS2021_1 19694 non-null object
18 NUTS2021_0 19694 non-null object
19 geometry 19694 non-null geometry
dtypes: float64(6), geometry(1), int32(3), object(10)
memory usage: 2.8+ MB
None
----
Coordinate system: EPSG:3035
gdf.head()
DIST_BORD | TOT_P_2018 | TOT_P_2006 | GRD_ID | TOT_P_2011 | TOT_P_2021 | Y_LLC | CNTR_ID | NUTS2016_3 | NUTS2016_2 | NUTS2016_1 | NUTS2016_0 | LAND_PC | X_LLC | NUTS2021_3 | NUTS2021_2 | DIST_COAST | NUTS2021_1 | NUTS2021_0 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.981892e+06 | 0.0 | 0.0 | CRS3035RES20000mN-2780000E8700000 | 0.0 | 0 | -2780000 | FR | FRY50 | FRY5 | FRY | FR | 9.630000 | 8700000 | FRY50 | FRY5 | 4.548281e+06 | FRY | FR | POLYGON ((8700000 -2780000, 8720000 -2780000, ... |
1 | 5.029264e+06 | 0.0 | 0.0 | CRS3035RES20000mN-3040000E9960000 | 0.0 | 0 | -3040000 | FR | FRY40 | FRY4 | FRY | FR | 15.370000 | 9960000 | FRY40 | FRY4 | 5.457006e+06 | FRY | FR | POLYGON ((9960000 -3040000, 9980000 -3040000, ... |
2 | 4.076750e+05 | 1071.0 | 797.0 | CRS3035RES20000mN1460000E5580000 | 964.0 | 718 | 1460000 | EL | EL434 | EL43 | EL4 | EL | 59.720001 | 5580000 | EL434 | EL43 | 1.463860e+03 | EL4 | EL | POLYGON ((5580000 1460000, 5600000 1460000, 56... |
3 | 1.966680e+05 | 3045.0 | 2127.0 | CRS3035RES20000mN1780000E3020000 | 2636.0 | 3234 | 1780000 | ES | ES613 | ES61 | ES6 | ES | 96.940002 | 3020000 | ES613 | ES61 | 1.517438e+05 | ES6 | ES | POLYGON ((3020000 1780000, 3040000 1780000, 30... |
4 | 1.705567e+05 | 0.0 | 0.0 | CRS3035RES20000mN2920000E5960000 | 0.0 | 0 | 2920000 | UA | 100.000000 | 5960000 | 4.385629e+04 | POLYGON ((5960000 2920000, 5980000 2920000, 59... |
GeoDataFrame explanation#
GeoDataFrame inherits most of pandas DataFrame methods. That allows to work with GeoDataFrame on the same way.
# Creating new GeoDataFrame with defined columns
gdf[['NUTS2021_3','DIST_COAST','NUTS2021_1','geometry']]
NUTS2021_3 | DIST_COAST | NUTS2021_1 | geometry | |
---|---|---|---|---|
0 | FRY50 | 4.548281e+06 | FRY | POLYGON ((8700000 -2780000, 8720000 -2780000, ... |
1 | FRY40 | 5.457006e+06 | FRY | POLYGON ((9960000 -3040000, 9980000 -3040000, ... |
2 | EL434 | 1.463860e+03 | EL4 | POLYGON ((5580000 1460000, 5600000 1460000, 56... |
3 | ES613 | 1.517438e+05 | ES6 | POLYGON ((3020000 1780000, 3040000 1780000, 30... |
4 | 4.385629e+04 | POLYGON ((5960000 2920000, 5980000 2920000, 59... | ||
... | ... | ... | ... | ... |
19689 | 9.969987e+04 | POLYGON ((5860000 2920000, 5880000 2920000, 58... | ||
19690 | 8.995395e+04 | POLYGON ((5880000 2920000, 5900000 2920000, 59... | ||
19691 | 7.365964e+04 | POLYGON ((5900000 2920000, 5920000 2920000, 59... | ||
19692 | 5.880720e+04 | POLYGON ((5920000 2920000, 5940000 2920000, 59... | ||
19693 | 4.786262e+04 | POLYGON ((5940000 2920000, 5960000 2920000, 59... |
19694 rows × 4 columns
# Filtering records based on attribute value
gdf[gdf['TOT_P_2018']>0].head()
DIST_BORD | TOT_P_2018 | TOT_P_2006 | GRD_ID | TOT_P_2011 | TOT_P_2021 | Y_LLC | CNTR_ID | NUTS2016_3 | NUTS2016_2 | NUTS2016_1 | NUTS2016_0 | LAND_PC | X_LLC | NUTS2021_3 | NUTS2021_2 | DIST_COAST | NUTS2021_1 | NUTS2021_0 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 407674.968750 | 1071.0 | 797.0 | CRS3035RES20000mN1460000E5580000 | 964.0 | 718 | 1460000 | EL | EL434 | EL43 | EL4 | EL | 59.720001 | 5580000 | EL434 | EL43 | 1463.859985 | EL4 | EL | POLYGON ((5580000 1460000, 5600000 1460000, 56... |
3 | 196668.046875 | 3045.0 | 2127.0 | CRS3035RES20000mN1780000E3020000 | 2636.0 | 3234 | 1780000 | ES | ES613 | ES61 | ES6 | ES | 96.940002 | 3020000 | ES613 | ES61 | 151743.843750 | ES6 | ES | POLYGON ((3020000 1780000, 3040000 1780000, 30... |
14 | 215925.015625 | 5175.0 | 5251.0 | CRS3035RES20000mN1780000E3040000 | 5710.0 | 4952 | 1780000 | ES | ES613 | ES61 | ES6 | ES | 98.290001 | 3040000 | ES613 | ES61 | 152737.187500 | ES6 | ES | POLYGON ((3040000 1780000, 3060000 1780000, 30... |
20 | 573108.437500 | 774.0 | 783.0 | CRS3035RES20000mN2940000E3240000 | 775.0 | 1117 | 2940000 | FR | FRH02 | FRH0 | FRH | FR | 0.660000 | 3240000 | FRH02 | FRH0 | 9194.360352 | FRH | FR | POLYGON ((3240000 2940000, 3260000 2940000, 32... |
21 | 553914.937500 | 2551.0 | 2678.0 | CRS3035RES20000mN2940000E3260000 | 2694.0 | 2681 | 2940000 | FR | FRH02 | FRH0 | FRH | FR | 4.920000 | 3260000 | FRH02 | FRH0 | 7043.680176 | FRH | FR | POLYGON ((3260000 2940000, 3280000 2940000, 32... |
Displaying geometries on basemap#
To display vector geometry on map we recommend folium. Folium allows displaying different types of geometries like Polygons, Lines and Points.
IMPORTANT: Each geometry presenting on map must be transformed to EPSG:4326 coordinates system
# Filtering many polygons
gdf_filter = gdf[gdf.NUTS2021_0 == 'FR']
# Add the polygons to the map
m1 = folium.Map(location=[46.865, 2.508], zoom_start=6)
for _, r in gdf_filter.to_crs(4326).iterrows():
sim_geo = gpd.GeoSeries(r["geometry"])
geo_j = sim_geo.to_json()
geo_j = folium.GeoJson(data=geo_j, style_function=lambda x: {"fillColor": "orange"})
folium.Popup(r["GRD_ID"]).add_to(geo_j)
geo_j.add_to(m1)
m1
Make this Notebook Trusted to load map: File -> Trust Notebook
%%time
# S3 reference to GeoTIFF
object_path = 'GISCO_Reference_Grid/grid_10km.csv'
# Create a temporary directory to store the CSV file
with tempfile.TemporaryDirectory() as tmpdirname:
# Define local path to save CSV file
local_csv_path = os.path.join(tmpdirname, object_path.split('/')[-1])
# Download CSV from S3
s3.download_file('ESTAT', object_path, local_csv_path)
# Read CSV into DataFrame
df = pd.read_csv(local_csv_path)
CPU times: user 183 ms, sys: 32 ms, total: 215 ms
Wall time: 346 ms
# Printing
df.head()
DIST_BORD | TOT_P_2018 | TOT_P_2006 | GRD_ID | TOT_P_2011 | TOT_P_2021 | Y_LLC | CNTR_ID | NUTS2016_3 | NUTS2016_2 | NUTS2016_1 | NUTS2016_0 | LAND_PC | X_LLC | NUTS2021_3 | NUTS2021_2 | DIST_COAST | NUTS2021_1 | NUTS2021_0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.974822e+06 | 0 | 0 | CRS3035RES10000mN-2770000E8700000 | 0 | 0 | -2770000 | FR | NaN | NaN | NaN | NaN | 0.00 | 8700000 | NaN | NaN | 4.541512e+06 | NaN | NaN |
1 | 4.010088e+06 | 0 | 0 | CRS3035RES10000mN-2780000E8740000 | 0 | 0 | -2780000 | FR | FRY50 | FRY5 | FRY | FR | 9.36 | 8740000 | FRY50 | FRY5 | 4.569328e+06 | FRY | FR |
2 | 5.016317e+05 | 3004 | 2733 | CRS3035RES10000mN1020000E2000000 | 3544 | 3056 | 1020000 | ES | ES708 | ES70 | ES7 | ES | 100.00 | 2000000 | ES708 | ES70 | 1.424885e+05 | ES7 | ES |
3 | 1.559614e+05 | 0 | 0 | CRS3035RES10000mN1640000E3080000 | 0 | 0 | 1640000 | ES | ES614-ES617 | ES61 | ES6 | ES | 100.00 | 3080000 | ES614-ES617 | ES61 | 1.274594e+04 | ES6 | ES |
4 | 4.450080e+05 | 0 | 0 | CRS3035RES10000mN2040000E6180000 | 0 | 0 | 2040000 | TR | TR412-TR521 | TR41-TR52 | TR4-TR5 | TR | 100.00 | 6180000 | TR412-TR521 | TR41-TR52 | 2.279641e+05 | TR4-TR5 | TR |
Reading Parquet file to GeoDataFrame#
%%time
object_path = 'GISCO_Reference_Grid/grid_10km.parquet'
# Create a temporary directory to store parquet file
with tempfile.TemporaryDirectory() as tmpdirname:
# Define local path to save parquet
local_parquet_path = os.path.join(tmpdirname, object_path.split('/')[-1])
# Download the parquet file from S3
s3.download_file('ESTAT', object_path, local_parquet_path)
# Read the parquet into a GeoDataFrame
df = pd.read_parquet(local_parquet_path)
CPU times: user 121 ms, sys: 23.9 ms, total: 145 ms
Wall time: 163 ms
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74861 entries, 0 to 74860
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 DIST_BORD 74861 non-null float64
1 TOT_P_2018 74861 non-null int32
2 TOT_P_2006 74861 non-null int32
3 GRD_ID 74861 non-null object
4 TOT_P_2011 74861 non-null int32
5 TOT_P_2021 74861 non-null int32
6 Y_LLC 74861 non-null int32
7 CNTR_ID 74861 non-null object
8 NUTS2016_3 64401 non-null object
9 NUTS2016_2 64401 non-null object
10 NUTS2016_1 64401 non-null object
11 NUTS2016_0 64401 non-null object
12 LAND_PC 74861 non-null float64
13 X_LLC 74861 non-null int32
14 NUTS2021_3 64402 non-null object
15 NUTS2021_2 64402 non-null object
16 DIST_COAST 74861 non-null float64
17 NUTS2021_1 64402 non-null object
18 NUTS2021_0 64402 non-null object
dtypes: float64(3), int32(6), object(10)
memory usage: 9.1+ MB
DIST_BORD | TOT_P_2018 | TOT_P_2006 | GRD_ID | TOT_P_2011 | TOT_P_2021 | Y_LLC | CNTR_ID | NUTS2016_3 | NUTS2016_2 | NUTS2016_1 | NUTS2016_0 | LAND_PC | X_LLC | NUTS2021_3 | NUTS2021_2 | DIST_COAST | NUTS2021_1 | NUTS2021_0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.974822e+06 | 0 | 0 | CRS3035RES10000mN-2770000E8700000 | 0 | 0 | -2770000 | FR | None | None | None | None | 0.00 | 8700000 | None | None | 4.541512e+06 | None | None |
1 | 4.010088e+06 | 0 | 0 | CRS3035RES10000mN-2780000E8740000 | 0 | 0 | -2780000 | FR | FRY50 | FRY5 | FRY | FR | 9.36 | 8740000 | FRY50 | FRY5 | 4.569328e+06 | FRY | FR |
2 | 5.016317e+05 | 3004 | 2733 | CRS3035RES10000mN1020000E2000000 | 3544 | 3056 | 1020000 | ES | ES708 | ES70 | ES7 | ES | 100.00 | 2000000 | ES708 | ES70 | 1.424885e+05 | ES7 | ES |
3 | 1.559614e+05 | 0 | 0 | CRS3035RES10000mN1640000E3080000 | 0 | 0 | 1640000 | ES | ES614-ES617 | ES61 | ES6 | ES | 100.00 | 3080000 | ES614-ES617 | ES61 | 1.274594e+04 | ES6 | ES |
4 | 4.450080e+05 | 0 | 0 | CRS3035RES10000mN2040000E6180000 | 0 | 0 | 2040000 | TR | TR412-TR521 | TR41-TR52 | TR4-TR5 | TR | 100.00 | 6180000 | TR412-TR521 | TR41-TR52 | 2.279641e+05 | TR4-TR5 | TR |
Generating geometries of the grid from coordinates in parquet file#
from shapely.geometry import Point, box
def create_geometry(df, resolution_km):
cell_size = resolution_km * 1000
df['geometry'] = df.apply(lambda row: box(row['X_LLC'], row['Y_LLC'],
row['X_LLC'] + cell_size, row['Y_LLC'] + cell_size), axis=1)
return gpd.GeoDataFrame(df, geometry='geometry')
lux_df = df[df['CNTR_ID'].str.contains('LU', na=False)].copy()
resolution_km = 10 # Set this according to your data resolution
lux_gdf = create_geometry(lux_df, resolution_km)
# Set the coordinate reference system
lux_gdf.set_crs(epsg=3035, inplace=True)
lux_gdf.head()
DIST_BORD | TOT_P_2018 | TOT_P_2006 | GRD_ID | TOT_P_2011 | TOT_P_2021 | Y_LLC | CNTR_ID | NUTS2016_3 | NUTS2016_2 | NUTS2016_1 | NUTS2016_0 | LAND_PC | X_LLC | NUTS2021_3 | NUTS2021_2 | DIST_COAST | NUTS2021_1 | NUTS2021_0 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
30836 | 8456.019531 | 8457 | 7793 | CRS3035RES10000mN2930000E4010000 | 8115 | 8550 | 2930000 | LU-FR | FRF31 | FRF3 | FRF | FR | 100.000000 | 4010000 | FRF31 | FRF3 | 238175.593750 | FRF | FR | POLYGON ((4020000 2930000, 4020000 2940000, 40... |
30837 | 4399.140137 | 41076 | 34991 | CRS3035RES10000mN2930000E4020000 | 40560 | 48170 | 2930000 | LU-FR | LU000-FRF31-FRF33 | LU00-FRF3 | LU0-FRF | LU-FR | 100.000000 | 4020000 | LU000-FRF31-FRF33 | LU00-FRF3 | 242345.468750 | LU0-FRF | LU-FR | POLYGON ((4030000 2930000, 4030000 2940000, 40... |
30838 | 296.720001 | 52368 | 43368 | CRS3035RES10000mN2930000E4030000 | 45902 | 54385 | 2930000 | LU-FR | LU000-FRF33 | LU00-FRF3 | LU0-FRF | LU-FR | 100.000000 | 4030000 | LU000-FRF33 | LU00-FRF3 | 246850.234375 | LU0-FRF | LU-FR | POLYGON ((4040000 2930000, 4040000 2940000, 40... |
30839 | 3944.389893 | 8360 | 6416 | CRS3035RES10000mN2930000E4040000 | 7028 | 8775 | 2930000 | LU-FR | LU000-FRF33 | LU00-FRF3 | LU0-FRF | LU-FR | 99.260002 | 4040000 | LU000-FRF33 | LU00-FRF3 | 251671.906250 | LU0-FRF | LU-FR | POLYGON ((4050000 2930000, 4050000 2940000, 40... |
30840 | 226.130005 | 20014 | 17188 | CRS3035RES10000mN2930000E4050000 | 18939 | 21845 | 2930000 | LU-DE-FR | DEC02-LU000-FRF33 | DEC0-LU00-FRF3 | LU0-DEC-FRF | LU-DE-FR | 98.000000 | 4050000 | DEC02-LU000-FRF33 | LU00-DEC0-FRF3 | 256792.656250 | DEC-LU0-FRF | LU-DE-FR | POLYGON ((4060000 2930000, 4060000 2940000, 40... |
m2 = folium.Map(location=[49.85, 6.15], zoom_start=10)
for _, r in lux_gdf.to_crs(4326).iterrows():
sim_geo = gpd.GeoSeries(r["geometry"])
geo_j = sim_geo.to_json()
geo_j = folium.GeoJson(data=geo_j, style_function=lambda x: {"fillColor": "orange"})
folium.Popup(r["GRD_ID"]).add_to(geo_j)
geo_j.add_to(m2)
m2
Make this Notebook Trusted to load map: File -> Trust Notebook