import boto3
import configparser
import os
import urllib3

import folium
import geopandas as gpd
import rasterio
from rasterio.plot import show
import numpy as np
from matplotlib import pyplot

import tempfile
urllib3.disable_warnings()

Connection with S3 Bucket#

All Census GRID datasets are available in an S3 bucket. The configuration below allows you to list and download the datasets from there.

def s3_connection(credentials: dict):
    """Establishes a connection to an S3 bucket.

    Args:
        credentials (dict): A dictionary containing AWS S3 credentials with keys
                            'host_base', 'access_key', and 'secret_key'.

    Returns:
        An S3 client (botocore.client.BaseClient) configured with the provided
        credentials for interacting with the S3 service.
    """
    s3 = boto3.client('s3',
                      endpoint_url=credentials['host_base'],
                      aws_access_key_id=credentials['access_key'],
                      aws_secret_access_key=credentials['secret_key'],
                      use_ssl=True,
                      verify=False)
    return s3

# Load s3 credentials
config = configparser.ConfigParser()
config.read('/home/eouser/.s3cfg')
credentials = dict(config['default'].items())

# Connect to the S3 service
s3 = s3_connection(credentials)

Browsing S3 bucket content#

response = s3.list_objects_v2(Bucket='ESTAT', Prefix='Census_GRID')
if 'Contents' in response:
    print("Objects in bucket:")
    # Iterate over each object
    for obj in response['Contents']:
        print(obj['Key'])
else:
    print("No objects found in the bucket.")
Objects in bucket:
Census_GRID/2021/ESTAT_Census_2021_V2.csv
Census_GRID/2021/ESTAT_Census_2021_V2.gpkg
Census_GRID/2021/ESTAT_Census_2021_V2.parquet
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_AT_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_BE_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_BG_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_CH_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_CY_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_CZ_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_DE_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_DK_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_EE_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_EL_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_ES_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_FI_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_FR_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_HR_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_HU_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_IE_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_IT_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_LI_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_LT_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_LU_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_LV_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_MT_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_NL_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_NO_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_PL_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_PT_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_RO_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_SE_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_SI_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_Census_2021_V2_SDMX_Metadata/CENSUS_INS21ES_A_SK_2021_0000.sdmx.xml
Census_GRID/2021/ESTAT_OBS-VALUE-CHG_IN_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-CHG_OUT_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-CONFIDENTIAL_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-EMP_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-EU_OTH_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-F_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-LANDSURFACE_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-M_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-NAT_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-OTH_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-POPULATED_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-SAME_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-T_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-Y_1564_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-Y_GE65_2021_V2.tiff
Census_GRID/2021/ESTAT_OBS-VALUE-Y_LT15_2021_V2.tiff
Census_GRID/2021/read.me
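
Note that list_objects_v2 returns at most 1000 keys per call. For prefixes containing more objects, the standard boto3 paginator can be used; a minimal sketch:

# List all keys under a prefix, regardless of how many there are,
# by iterating over paginated list_objects_v2 responses.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket='ESTAT', Prefix='Census_GRID'):
    for obj in page.get('Contents', []):
        print(obj['Key'])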

Reading vector file to GeoDataFrame#

%%time 

object_path = 'Census_GRID/2021/ESTAT_Census_2021_V2.gpkg'

# Create a temporary directory to store GeoPackage file
with tempfile.TemporaryDirectory() as tmpdirname:
    # Define local path to save GeoPackage file
    local_geopackage_path = os.path.join(tmpdirname, object_path.split('/')[-1])

    # Download the GeoPackage from S3
    s3.download_file('ESTAT', object_path, local_geopackage_path)

    # Read the GeoPackage into a GeoDataFrame
    gdf = gpd.read_file(local_geopackage_path)
CPU times: user 1min 6s, sys: 3.69 s, total: 1min 10s
Wall time: 1min 9s
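
Reading the full GeoPackage takes more than a minute. If only a subset of the data is needed, geopandas.read_file can limit what is loaded via its rows and bbox parameters; a minimal sketch (the bounding-box coordinates are illustrative only, and the calls must run while the downloaded file still exists, i.e. inside the temporary-directory block above):

# Read only the first 1000 features
gdf_sample = gpd.read_file(local_geopackage_path, rows=1000)

# Read only features intersecting a bounding box given in EPSG:3035
# (the CRS of the dataset); these coordinates are example values
gdf_bbox = gpd.read_file(local_geopackage_path, bbox=(3200000, 2000000, 3300000, 2100000))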
# Geodata parameters
print(gdf.info())
print('----')
print(f'Coordinate system: {gdf.crs}')
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4594018 entries, 0 to 4594017
Data columns (total 18 columns):
 #   Column              Dtype   
---  ------              -----   
 0   GRD_ID              object  
 1   T                   int64   
 2   M                   float64 
 3   F                   float64 
 4   Y_LT15              float64 
 5   Y_1564              float64 
 6   Y_GE65              float64 
 7   EMP                 float64 
 8   NAT                 float64 
 9   EU_OTH              float64 
 10  OTH                 float64 
 11  SAME                float64 
 12  CHG_IN              float64 
 13  CHG_OUT             float64 
 14  LAND_SURFACE        float64 
 15  POPULATED           int64   
 16  CONFIDENTIALSTATUS  float64 
 17  geometry            geometry
dtypes: float64(14), geometry(1), int64(2), object(1)
memory usage: 630.9+ MB
None
----
Coordinate system: EPSG:3035
gdf.head()
GRD_ID T M F Y_LT15 Y_1564 Y_GE65 EMP NAT EU_OTH OTH SAME CHG_IN CHG_OUT LAND_SURFACE POPULATED CONFIDENTIALSTATUS geometry
0 CRS3035RES1000mN1000000E1966000 259 143.0 116.0 38.0 190.0 29.0 86.0 113.0 94.0 47.0 220.0 24.0 11.0 0.6115 1 NaN POLYGON ((1966000 1000000, 1967000 1000000, 19...
1 CRS3035RES1000mN1000000E1967000 4 4.0 2.0 1.0 2.0 2.0 2.0 4.0 0.0 0.0 4.0 0.0 0.0 1.0000 1 NaN POLYGON ((1967000 1000000, 1968000 1000000, 19...
2 CRS3035RES1000mN1000000E1968000 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0000 0 NaN POLYGON ((1968000 1000000, 1969000 1000000, 19...
3 CRS3035RES1000mN1000000E1969000 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0000 0 NaN POLYGON ((1969000 1000000, 1970000 1000000, 19...
4 CRS3035RES1000mN1000000E1970000 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0000 0 NaN POLYGON ((1970000 1000000, 1971000 1000000, 19...

GeoDataFrame explanation#

A GeoDataFrame inherits most pandas DataFrame methods, which allows you to work with it in the same way as a regular DataFrame.

# Creating new GeoDataFrame with defined columns
gdf[['GRD_ID','POPULATED','geometry']]
GRD_ID POPULATED geometry
0 CRS3035RES1000mN1000000E1966000 1 POLYGON ((1966000 1000000, 1967000 1000000, 19...
1 CRS3035RES1000mN1000000E1967000 1 POLYGON ((1967000 1000000, 1968000 1000000, 19...
2 CRS3035RES1000mN1000000E1968000 0 POLYGON ((1968000 1000000, 1969000 1000000, 19...
3 CRS3035RES1000mN1000000E1969000 0 POLYGON ((1969000 1000000, 1970000 1000000, 19...
4 CRS3035RES1000mN1000000E1970000 0 POLYGON ((1970000 1000000, 1971000 1000000, 19...
... ... ... ...
4594013 CRS3035RES1000mN999000E1981000 1 POLYGON ((1981000 999000, 1982000 999000, 1982...
4594014 CRS3035RES1000mN999000E1982000 1 POLYGON ((1982000 999000, 1983000 999000, 1983...
4594015 CRS3035RES1000mN999000E1983000 0 POLYGON ((1983000 999000, 1984000 999000, 1984...
4594016 CRS3035RES1000mN999000E1985000 0 POLYGON ((1985000 999000, 1986000 999000, 1986...
4594017 CRS3035RES1000mN999000E1986000 0 POLYGON ((1986000 999000, 1987000 999000, 1987...

4594018 rows × 3 columns

# Filtering records based on attribute value 
gdf[gdf['T']>20].head()
GRD_ID T M F Y_LT15 Y_1564 Y_GE65 EMP NAT EU_OTH OTH SAME CHG_IN CHG_OUT LAND_SURFACE POPULATED CONFIDENTIALSTATUS geometry
0 CRS3035RES1000mN1000000E1966000 259 143.0 116.0 38.0 190.0 29.0 86.0 113.0 94.0 47.0 220.0 24.0 11.0 0.6115 1 NaN POLYGON ((1966000 1000000, 1967000 1000000, 19...
14 CRS3035RES1000mN1000000E1980000 285 143.0 142.0 38.0 215.0 29.0 92.0 152.0 50.0 85.0 234.0 46.0 5.0 1.0000 1 NaN POLYGON ((1980000 1000000, 1981000 1000000, 19...
15 CRS3035RES1000mN1000000E1981000 1828 973.0 856.0 198.0 1377.0 254.0 606.0 819.0 494.0 513.0 1465.0 267.0 90.0 0.6775 1 NaN POLYGON ((1981000 1000000, 1982000 1000000, 19...
16 CRS3035RES1000mN1000000E1982000 139 74.0 65.0 24.0 95.0 21.0 34.0 53.0 58.0 29.0 104.0 27.0 8.0 0.0504 1 NaN POLYGON ((1982000 1000000, 1983000 1000000, 19...
35 CRS3035RES1000mN1001000E1981000 7716 4028.0 3689.0 1062.0 5902.0 754.0 2659.0 3665.0 1440.0 2615.0 6281.0 1152.0 224.0 0.7921 1 NaN POLYGON ((1981000 1001000, 1982000 1001000, 19...
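
In the same spirit, standard pandas aggregations apply directly to the attribute columns; a minimal sketch using the columns shown above:

# Total population across all grid cells and the share of populated cells
print(gdf['T'].sum())
print(gdf['POPULATED'].mean())

# Descriptive statistics for the age-group columns
print(gdf[['Y_LT15', 'Y_1564', 'Y_GE65']].describe())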

Displaying geometries on basemap#

To display vector geometries on a map we recommend folium. Folium can display different geometry types such as polygons, lines and points.
IMPORTANT: Every geometry shown on the map must first be transformed to the EPSG:4326 coordinate system.

# Select a small subset of polygons (the first 101 grid cells) for display
gdf_filter = gdf.loc[:100]
# Add the polygons to the map

m1 = folium.Map(location=[28.730442, -13.911504], zoom_start=13)

for _, r in gdf_filter.to_crs(4326).iterrows():
    sim_geo = gpd.GeoSeries(r["geometry"]).simplify(tolerance=0.001)
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j, style_function=lambda x: {"fillColor": "orange"})
    folium.Popup(r["GRD_ID"]).add_to(geo_j)
    geo_j.add_to(m1)

m1
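
As an alternative to building the map manually, GeoPandas provides GeoDataFrame.explore(), which returns a folium map directly and handles the reprojection to EPSG:4326 for display; a minimal sketch (explore() relies on optional dependencies such as folium and matplotlib):

# Interactive map of the filtered grid cells, coloured by total population
m2 = gdf_filter.explore(column='T', cmap='viridis', tooltip='GRD_ID')
m2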

Reading raster file#

Geographic information systems use GeoTIFF and other formats to organize and store gridded, or raster, datasets. Rasterio reads and writes these formats and provides a Python API based on N-D arrays.
More information: https://rasterio.readthedocs.io/en/stable/quickstart.html

# S3 reference to GeoTIFF
object_path = 'Census_GRID/2021/ESTAT_OBS-VALUE-POPULATED_2021_V2.tiff'

# Create a temporary directory to store the raster file
with tempfile.TemporaryDirectory() as tmpdirname:
    # Define local path to save raster file
    local_raster_path = os.path.join(tmpdirname, object_path.split('/')[-1])

    # Download raster from S3
    s3.download_file('ESTAT', object_path, local_raster_path)

    # Read raster into numpy array
    with rasterio.open(local_raster_path) as src:
        raster_data = src.read()
        raster_metadata = src.meta
# Printing metadata

print(src.meta)       # Full raster metadata
print('---')
print(src.width, src.height)  # Number of columns and rows in the raster
print(src.crs)        # Coordinate system
print(src.transform)  # Parameters of the affine transformation
print(src.count)      # Number of bands
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': -1.0, 'width': 5562, 'height': 4475, 'count': 1, 'crs': CRS.from_wkt('PROJCS["ETRS89-extended / LAEA Europe",GEOGCS["ETRS89",DATUM["European_Terrestrial_Reference_System_1989",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6258"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4258"]],PROJECTION["Lambert_Azimuthal_Equal_Area"],PARAMETER["latitude_of_center",52],PARAMETER["longitude_of_center",10],PARAMETER["false_easting",4321000],PARAMETER["false_northing",3210000],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Northing",NORTH],AXIS["Easting",EAST],AUTHORITY["EPSG","3035"]]'), 'transform': Affine(1000.0, 0.0, 943000.0,
       0.0, -1000.0, 5416000.0)}
---
5562 4475
EPSG:3035
| 1000.00, 0.00, 943000.00|
| 0.00,-1000.00, 5416000.00|
| 0.00, 0.00, 1.00|
1
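
The affine transform maps array indices (row, column) to coordinates in the raster CRS (EPSG:3035). A minimal sketch using rasterio.transform.xy to recover the coordinates of a pixel centre:

from rasterio.transform import xy

# Coordinates (EPSG:3035) of the centre of the pixel at row 0, column 0;
# with the transform shown above this yields (943500.0, 5415500.0)
x, y = xy(src.transform, 0, 0)
print(x, y)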
# Raster data is read as a numpy array
print(type(raster_data))
print(raster_data.shape)
raster_data
<class 'numpy.ndarray'>
(1, 4475, 5562)
array([[[-1., -1., -1., ..., -1., -1., -1.],
        [-1., -1., -1., ..., -1., -1., -1.],
        [-1., -1., -1., ..., -1., -1., -1.],
        ...,
        [-1., -1., -1., ..., -1., -1., -1.],
        [-1., -1., -1., ..., -1., -1., -1.],
        [-1., -1., -1., ..., -1., -1., -1.]]], dtype=float32)
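
The nodata value (-1.0, as reported in the metadata) should be excluded before computing any statistics; a minimal sketch:

# Mask nodata cells using the value stored in the raster metadata
nodata = raster_metadata['nodata']
valid = raster_data[0][raster_data[0] != nodata]

print(valid.size)                # number of valid cells
print(valid.min(), valid.max())  # value range of the indicator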

Plotting raster file#

There are a couple of options for plotting a raster file. Below we present the most popular ones, using the matplotlib.pyplot module and the plotting tool implemented in the rasterio package. More information can be found in the rasterio documentation.

# Plotting definition
fig, ax = pyplot.subplots(1, figsize=(20,10))

# Plotting raster
pyplot.imshow(raster_data[0], cmap='pink')

# Plotting color bar
pyplot.colorbar(label='Parameter name', fraction=0.02, ax=ax)
<matplotlib.colorbar.Colorbar at 0x7f63603d8f40>
../_images/ca5eb2a89ac6ddb6aa61af66853684246b6d4d8896b7f6eb784d55cee58451bf.png
# Rasterio also provides rasterio.plot.show() to perform common tasks such as displaying 
# multi-band images as RGB and labeling the axes with proper geo-referenced extents.

# Plotting definition
fig, ax = pyplot.subplots(1, figsize=(20,10))

# Plotting raster with georeference
rs = show(raster_data[0], ax=ax, transform=src.transform)

# Plotting color bar
fig.colorbar(rs.get_images()[0], ax=ax, label='Parameter name')
<matplotlib.colorbar.Colorbar at 0x7f63620b2e30>
../_images/6a66a384d7b7ffa2238767ecd5cf7121eafb8a58d80ab2753bf127b1350f801c.png
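
A value histogram is another quick way to inspect the raster. A minimal sketch using matplotlib on the masked values from the nodata-masking step above (it assumes the valid array is available):

# Histogram of valid cell values (nodata already excluded)
fig, ax = pyplot.subplots(1, figsize=(10, 5))
ax.hist(valid.ravel(), bins=50, color='grey')
ax.set_xlabel('Cell value')
ax.set_ylabel('Number of cells')
pyplot.show()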

Reading Parquet file to GeoDataFrame#

%%time

object_path = 'Census_GRID/2021/ESTAT_Census_2021_V2.parquet'

# Create a temporary directory to store parquet file
with tempfile.TemporaryDirectory() as tmpdirname:
    # Define local path to save parquet
    local_parquet_path = os.path.join(tmpdirname, object_path.split('/')[-1])

    # Download the parquet file from S3
    s3.download_file('ESTAT', object_path, local_parquet_path)

    # Read the parquet into a GeoDataFrame
    gdf = gpd.read_parquet(local_parquet_path)
CPU times: user 6.18 s, sys: 2.2 s, total: 8.38 s
Wall time: 7.42 s
gdf.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4594018 entries, 0 to 4594017
Data columns (total 18 columns):
 #   Column              Dtype   
---  ------              -----   
 0   GRD_ID              object  
 1   T                   int64   
 2   M                   float64 
 3   F                   float64 
 4   Y_LT15              float64 
 5   Y_1564              float64 
 6   Y_GE65              float64 
 7   EMP                 float64 
 8   NAT                 float64 
 9   EU_OTH              float64 
 10  OTH                 float64 
 11  SAME                float64 
 12  CHG_IN              float64 
 13  CHG_OUT             float64 
 14  LAND_SURFACE        float64 
 15  POPULATED           int64   
 16  CONFIDENTIALSTATUS  float64 
 17  geom                geometry
dtypes: float64(14), geometry(1), int64(2), object(1)
memory usage: 630.9+ MB
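
When only a few attributes are needed, geopandas.read_parquet can load a subset of columns, which is faster and uses less memory. A minimal sketch (note that the geometry column in the Parquet file is named geom, and the call must run while the downloaded file still exists inside the temporary-directory block above):

# Load only the grid identifier, total population and geometry columns
gdf_subset = gpd.read_parquet(local_parquet_path, columns=['GRD_ID', 'T', 'geom'])
print(gdf_subset.info())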
gdf_filter.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   GRD_ID              101 non-null    object  
 1   T                   101 non-null    int64   
 2   M                   101 non-null    float64 
 3   F                   101 non-null    float64 
 4   Y_LT15              101 non-null    float64 
 5   Y_1564              101 non-null    float64 
 6   Y_GE65              101 non-null    float64 
 7   EMP                 101 non-null    float64 
 8   NAT                 101 non-null    float64 
 9   EU_OTH              101 non-null    float64 
 10  OTH                 101 non-null    float64 
 11  SAME                101 non-null    float64 
 12  CHG_IN              101 non-null    float64 
 13  CHG_OUT             101 non-null    float64 
 14  LAND_SURFACE        101 non-null    float64 
 15  POPULATED           101 non-null    int64   
 16  CONFIDENTIALSTATUS  0 non-null      float64 
 17  geometry            101 non-null    geometry
dtypes: float64(14), geometry(1), int64(2), object(1)
memory usage: 14.3+ KB

Writing back data to the Read-Write S3 bucket#

from botocore.config import Config

custom_config = Config(
   request_checksum_calculation='WHEN_REQUIRED', 
   response_checksum_validation='WHEN_REQUIRED'
)

s3_client = boto3.client('s3', endpoint_url=credentials['host_base'],
                      aws_access_key_id=credentials['access_key'],
                      aws_secret_access_key=credentials['secret_key'],
                      use_ssl=True,
                      verify=False,
                      config=custom_config)

response = s3_client.list_buckets()
print(response['Buckets'])
[{'Name': 'estat-test', 'CreationDate': datetime.datetime(2025, 3, 4, 14, 28, 54, 502000, tzinfo=tzlocal())}]
from io import BytesIO

# Convert the GeoDataFrame to a Parquet file in-memory
buffer = BytesIO()
gdf_filter.to_parquet(buffer, engine='pyarrow')
buffer.seek(0)  # Move to the beginning of the buffer

# Specify your S3 bucket and file name
bucket_name = response['Buckets'][0]['Name']
file_name = 'teszt.parquet'

# Upload the file
s3_client.upload_fileobj(buffer, bucket_name, file_name)
response = s3.list_objects_v2(Bucket=bucket_name)
if 'Contents' in response:
    print("Objects in bucket:")
    # Iterate over each object
    for obj in response['Contents']:
        print(obj['Key'])
else:
    print("No objects found in the bucket.")
Objects in bucket:
teszt.parquet
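
To close the loop, the uploaded object can be read back for verification and then removed; a minimal sketch using standard boto3 calls:

# Download the uploaded Parquet file into memory and check its shape
check_buffer = BytesIO()
s3_client.download_fileobj(bucket_name, file_name, check_buffer)
check_buffer.seek(0)
print(gpd.read_parquet(check_buffer).shape)

# Remove the test object from the bucket
s3_client.delete_object(Bucket=bucket_name, Key=file_name)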