import boto3
import configparser
import os
import urllib3

import folium
import geopandas as gpd
import pandas as pd
import pyarrow.parquet as pq
import rasterio
from rasterio.plot import show
import numpy as np
from matplotlib import pyplot
from shapely import wkb
import contextily as cx

import tempfile
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 1
----> 1 import boto3
      2 import configparser
      3 import os

ModuleNotFoundError: No module named 'boto3'
urllib3.disable_warnings()

Connection with S3 Bucket#

The field boundary dataset is available on the S3 Bucket. Below configuration allows to list and download the dataset from there.

def s3_connection(credentials: dict) -> boto3.session.Session:
    """Establishes a connection to an S3 bucket.

    Args:
        credentials (dict): A dictionary containing AWS S3 credentials with keys 
                            'host_base', 'access_key', and 'secret_key'.

    Returns:
        boto3.session.Session: A boto3 session client configured with the provided 
                               credentials for interacting with the S3 service.
    """
    s3 = boto3.client('s3',
                      endpoint_url=credentials['host_base'],
                      aws_access_key_id=credentials['access_key'],
                      aws_secret_access_key=credentials['secret_key'],
                      use_ssl=True,
                      verify=False)
    return s3

# Load s3 credentials
config = configparser.ConfigParser()
config.read('/home/eouser/.s3cfg')
credentials = dict(config['default'].items())

# Connection with S3 eodata
s3 = s3_connection(credentials)

Browsing S3 bucket content#

response = s3.list_objects_v2(Bucket='ESTAT', Prefix='Field_boundaries')
if 'Contents' in response:
    print("Objects in bucket:")
    # Iterate over each object
    for obj in response['Contents']:
        print(obj['Key'])
else:
    print("No objects found in the bucket.")
Objects in bucket:
Field_boundaries/field_boundaries.parquet

Reading Parquet file to GeoDataFrame#

First download the parquet file to the server.

%%time 

object_path = 'Field_boundaries/field_boundaries.parquet'

    # Define local path to save parquet
local_parquet_path = os.path.join('/home/eouser', object_path.split('/')[-1])

    # Download the parquet file from S3
s3.download_file('ESTAT', object_path, local_parquet_path)
CPU times: user 40.6 s, sys: 31.7 s, total: 1min 12s
Wall time: 56.1 s

As the parquet file is very large above 10GB it is not possible load all into memory you have to read and work in batches.

# Load the file using pyarrow
file_path = local_parquet_path
parquet_file = pq.ParquetFile(file_path)

# Define your batch size or the number of rows you want
n_rows = 100

# Use pyarrow to read batches
batches = parquet_file.iter_batches(batch_size=n_rows)

# Get the first chunk
first_batch = next(batches)

# Convert to a GeoDataFrame
gdf = gpd.GeoDataFrame(first_batch.to_pandas())

gdf.info()
gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   id                      100 non-null    object             
 1   area                    100 non-null    Float32            
 2   geometry                100 non-null    object             
 3   determination_datetime  100 non-null    datetime64[ms, UTC]
 4   planet:ca_ratio         100 non-null    float32            
 5   planet:micd             100 non-null    float32            
 6   planet:qa               100 non-null    uint8              
 7   determination_method    100 non-null    string             
 8   bbox                    100 non-null    object             
dtypes: Float32(1), datetime64[ms, UTC](1), float32(2), object(3), string(1), uint8(1)
memory usage: 5.4+ KB
id area geometry determination_datetime planet:ca_ratio planet:micd planet:qa determination_method bbox
0 35291833 0.270757 b"\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0b\x00... 2022-06-01 00:00:00+00:00 0.391210 46.104290 0 auto-imagery {'xmin': 18.72719492257748, 'ymin': 45.8199928...
1 35291842 0.627083 b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\r\x00\x... 2022-06-01 00:00:00+00:00 0.553884 76.972801 0 auto-imagery {'xmin': 18.701706606680116, 'ymin': 45.819452...
2 35291851 0.143134 b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\n\x00\x... 2022-06-01 00:00:00+00:00 0.524641 33.431023 0 auto-imagery {'xmin': 18.701139716917233, 'ymin': 45.819363...
3 35291869 10.217677 b"\x01\x03\x00\x00\x00\x01\x00\x00\x00\x1e\x00... 2022-06-01 00:00:00+00:00 2.750239 278.701172 0 auto-imagery {'xmin': 18.715886959814554, 'ymin': 45.819687...
4 35291883 1.706494 b"\x01\x03\x00\x00\x00\x01\x00\x00\x00\r\x00\x... 2022-06-01 00:00:00+00:00 1.013447 97.955658 0 auto-imagery {'xmin': 18.695882937052108, 'ymin': 45.818939...

If we want to filter we have to iterate through the parquet file. It can take several minutes.

%%time

filtered_data = []

# The bounding box coordinates of the area of interest
min_lon, min_lat = 21.633179, 47.995114
max_lon, max_lat = 21.684402, 48.027131


# Iterate through row groups (if defined, leads to chunk-wise processing)
for i in range(parquet_file.num_row_groups):
    # Read each row group one by one
    table = parquet_file.read_row_group(i)
    df = table.to_pandas()
    # df['xmin'] = df['bbox'].apply(lambda x: x['xmin'])
    # df['ymin'] = df['bbox'].apply(lambda x: x['ymin'])

    # Apply the intersection filtering to each row group
    filtered_df = df[df['bbox'].apply(lambda bbox: bbox['xmin'] >= min_lon and bbox['xmin'] <= max_lon and bbox['ymin'] >= min_lat and bbox['ymin'] <= max_lat)]
    # Append the filtered result
    filtered_data.append(filtered_df)

# Combine all filtered data into a single DataFrame
combined_df = pd.concat(filtered_data, ignore_index=True)

# Display or process the combined filtered data
print(combined_df)
           id      area                                           geometry  \
0    33103797  1.410728  b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x10\x00...   
1    33103836  4.367515  b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x19\x00...   
2    33103861  0.399203  b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\r\x00\x...   
3    33103962  0.550601  b"\x01\x03\x00\x00\x00\x01\x00\x00\x00\x10\x00...   
4    33104053  0.671882  b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0c\x00...   
..        ...       ...                                                ...   
165  33135954  2.638525  b'\x01\x03\x00\x00\x00\x02\x00\x00\x00\x15\x00...   
166  33135963  7.261222  b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x1d\x00...   
167  33135980  3.019398  b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x15\x00...   
168  33135989    1.4842  b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x12\x00...   
169  33135993   2.29657  b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x1b\x00...   

       determination_datetime  planet:ca_ratio  planet:micd  planet:qa  \
0   2022-06-01 00:00:00+00:00         2.283350    68.912842          0   
1   2022-06-01 00:00:00+00:00         1.188137   170.421906          0   
2   2022-06-01 00:00:00+00:00         1.886232    40.514462          0   
3   2022-06-01 00:00:00+00:00         2.049192    43.568317          0   
4   2022-06-01 00:00:00+00:00         3.209024    55.889256          0   
..                        ...              ...          ...        ...   
165 2022-06-01 00:00:00+00:00         5.415477    92.389351          0   
166 2022-06-01 00:00:00+00:00         2.226575   188.741455          0   
167 2022-06-01 00:00:00+00:00         3.350346    87.315773          0   
168 2022-06-01 00:00:00+00:00         1.210054    97.991577          0   
169 2022-06-01 00:00:00+00:00         1.876090   122.943253          0   

    determination_method                                               bbox  
0           auto-imagery  {'xmin': 21.642864215506396, 'ymin': 48.008634...  
1           auto-imagery  {'xmin': 21.659535542623118, 'ymin': 48.008476...  
2           auto-imagery  {'xmin': 21.642631946956918, 'ymin': 48.008351...  
3           auto-imagery  {'xmin': 21.651274940417398, 'ymin': 48.007721...  
4           auto-imagery  {'xmin': 21.633483640778774, 'ymin': 48.002284...  
..                   ...                                                ...  
165         auto-imagery  {'xmin': 21.67613986305151, 'ymin': 48.0033189...  
166         auto-imagery  {'xmin': 21.676158582571418, 'ymin': 48.005186...  
167         auto-imagery  {'xmin': 21.675741599269546, 'ymin': 48.011326...  
168         auto-imagery  {'xmin': 21.676803684966814, 'ymin': 48.015685...  
169         auto-imagery  {'xmin': 21.677244014436575, 'ymin': 48.020335...  

[170 rows x 9 columns]
CPU times: user 1min 57s, sys: 42.3 s, total: 2min 39s
Wall time: 2min 56s

We were working as normal dataframe, we have to convert it into geo dataframe and convert the binary cooridantes into polygons to able to map.

gdf = gpd.GeoDataFrame(combined_df)
gdf['geometry'] = gdf['geometry'].apply(lambda x: wkb.loads(x))

We should define the geometry column and the CRS.

gdf = gdf.set_geometry("geometry")
gdf = gdf.set_crs(epsg=4326)
gdf.crs
<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

Displaying geometries on basemap#

To display vector geometry on map we recommend folium. Folium allows displaying different types of geometries like Polygons, Lines and Points.
IMPORTANT: Each geometry presenting on map must be transformed to EPSG:4326 coordinates system

ax = gdf.plot(alpha=0.5, edgecolor="k", figsize = (15,15))
cx.add_basemap(ax, crs=gdf.crs)
../_images/7f7d5e041a0c232267dc7eac5a0dc5a6c46c6bb711bee330518f5c3eaa5ff4ce.png
# Add the polygons to the map

m1 = folium.Map(location=[(min_lat+max_lat)/2, (min_lon+max_lon)/2], zoom_start=14)

for _, r in gdf.to_crs(4326).iterrows():
    sim_geo = gpd.GeoSeries(r["geometry"]).simplify(tolerance=0.001)
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j, style_function=lambda x: {"fillColor": "orange"})
    folium.Popup(r["area"]).add_to(geo_j)
    geo_j.add_to(m1)

m1
Make this Notebook Trusted to load map: File -> Trust Notebook
# Filtering many polygons
gdf_filter = gdf.loc[:100]
gdf_filter
id area geometry determination_datetime planet:ca_ratio planet:micd planet:qa determination_method bbox
0 33103797 1.410728 POLYGON ((21.64534 48.00863, 21.64521 48.00866... 2022-06-01 00:00:00+00:00 2.283350 68.912842 0 auto-imagery {'xmin': 21.642864215506396, 'ymin': 48.008634...
1 33103836 4.367515 POLYGON ((21.65959 48.00899, 21.65954 48.00916... 2022-06-01 00:00:00+00:00 1.188137 170.421906 0 auto-imagery {'xmin': 21.659535542623118, 'ymin': 48.008476...
2 33103861 0.399203 POLYGON ((21.64308 48.00857, 21.6427 48.00862,... 2022-06-01 00:00:00+00:00 1.886232 40.514462 0 auto-imagery {'xmin': 21.642631946956918, 'ymin': 48.008351...
3 33103962 0.550601 POLYGON ((21.65145 48.00773, 21.65134 48.00781... 2022-06-01 00:00:00+00:00 2.049192 43.568317 0 auto-imagery {'xmin': 21.651274940417398, 'ymin': 48.007721...
4 33104053 0.671882 POLYGON ((21.63351 48.00228, 21.63348 48.00236... 2022-06-01 00:00:00+00:00 3.209024 55.889256 0 auto-imagery {'xmin': 21.633483640778774, 'ymin': 48.002284...
... ... ... ... ... ... ... ... ... ...
96 33133311 1.299675 POLYGON ((21.65975 48.01465, 21.65966 48.01474... 2022-06-01 00:00:00+00:00 1.445122 95.394447 0 auto-imagery {'xmin': 21.659524428070384, 'ymin': 48.014622...
97 33133422 0.519171 POLYGON ((21.65974 48.0134, 21.65968 48.01343,... 2022-06-01 00:00:00+00:00 1.134290 61.894398 0 auto-imagery {'xmin': 21.65954661555796, 'ymin': 48.0133984...
98 33133430 1.388173 POLYGON ((21.6626 48.01012, 21.66283 48.01053,... 2022-06-01 00:00:00+00:00 3.713485 56.272381 0 auto-imagery {'xmin': 21.661560622709533, 'ymin': 48.008256...
99 33133453 0.269154 POLYGON ((21.64023 48.0132, 21.63986 48.01326,... 2022-06-01 00:00:00+00:00 1.798139 39.465385 0 auto-imagery {'xmin': 21.639141842895555, 'ymin': 48.013203...
100 33133480 5.920899 POLYGON ((21.64422 48.0114, 21.6444 48.01141, ... 2022-06-01 00:00:00+00:00 6.071929 135.380524 0 auto-imagery {'xmin': 21.643358597810153, 'ymin': 48.009700...

101 rows × 9 columns