This notebook splits the original PASTIS dataset into 4 per-tile subsets, packages each tile into a tar archive, and uploads the archives individually to an S3 bucket.
- dataset origin: https://github.com/VSainteuf/pastis-benchmark
- data source: https://www.eotdl.com/datasets/PASTIS-HD
#pip install boto3==1.35.95 botocore==1.35.95 python-dotenv
from dotenv import load_dotenv
load_dotenv("pastis.env")
True
import os
import json
import numpy as np
import xarray as xr
from pathlib import Path
from typing import Dict, List, Optional, Tuple

def load_geojson_ids_and_tiles(path: Path, max_items: Optional[int] = None) -> Dict[str, str]:
    """Map each patch id in metadata_pastis.geojson to its TILE property."""
    with open(path, "r", encoding="utf-8") as f:
        geojson = json.load(f)
    features = geojson.get("features", [])
    if max_items is not None:
        features = features[:max_items]
    id_to_tile = {}
    for feature in features:
        id_ = feature["id"]
        tile = feature.get("properties", {}).get("TILE", "UNKNOWN")
        id_to_tile[id_] = tile
    return id_to_tile
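The loader only relies on two fields per GeoJSON feature: the top-level id and properties.TILE. As a quick illustration (the id and tile values below are made up, not taken from the real metadata), a minimal file maps straight to an id→tile dictionary:
# Illustrative only: a tiny stand-in for metadata_pastis.geojson with fabricated values.
import json
import tempfile
from pathlib import Path
demo = {
    "type": "FeatureCollection",
    "features": [
        {"id": "10000", "properties": {"TILE": "t31tfj"}, "geometry": None},
        {"id": "20001", "properties": {"TILE": "t30uxv"}, "geometry": None},
    ],
}
with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as f:
    json.dump(demo, f)
print(load_geojson_ids_and_tiles(Path(f.name)))  # {'10000': 't31tfj', '20001': 't30uxv'}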
def find_files_with_ids_and_tiles(base_path: Path, id_to_tile: Dict[str, str]) -> Dict[str, List[Tuple[str, Path]]]:
    """Walk base_path and collect, per patch id, (arcname, full_path) pairs with arcname = TILE/ID/FILENAME."""
    results = {id_: [] for id_ in id_to_tile}
    for root, _, files in os.walk(base_path):
        for file in files:
            # Skip auxiliary files and zone masks
            if 'aux' in file.lower() or file.lower().startswith('zones_'):
                continue
            for id_ in id_to_tile:
                if id_ in file:
                    full_path = Path(root) / file
                    rel_path = full_path.relative_to(base_path)
                    tile = id_to_tile[id_]
                    arcname = str(Path(tile) / id_ / rel_path.name)
                    results[id_].append((arcname, full_path))
    return results
BASE_DIR = Path(os.getenv("BASE_DIR"))
id_to_tile = load_geojson_ids_and_tiles(BASE_DIR / "metadata_pastis.geojson")  # optionally pass max_items; 2433 patches in total
matched_files = find_files_with_ids_and_tiles(BASE_DIR, id_to_tile)

from collections import defaultdict
tile_counts = defaultdict(int)
for id_, path_pairs in matched_files.items():
    if not path_pairs:
        continue
    # Extract tile name from the arcname (assumes structure TILE/id_/filename)
    first_arcname = path_pairs[0][0]
    tile = Path(first_arcname).parts[0]
    tile_counts[tile] += 1

print("\n📦 Patch count per TILE:")
for tile, count in sorted(tile_counts.items()):
    print(f" - {tile}: {count} patches")
print(f"\n🔢 Total unique tiles: {len(tile_counts)}")
📦 Patch count per TILE:
- t30uxv: 531 patches
- t31tfj: 623 patches
- t31tfm: 723 patches
- t32ulu: 556 patches
🔢 Total unique tiles: 4
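Before packaging, it is worth spot-checking that the arcnames really follow the TILE/ID/FILENAME layout the tar members are supposed to have. A small sketch (the exact ids and filenames printed depend on your local PASTIS-HD copy):
# Sanity check: print the first few (arcname, source path) pairs of one matched patch.
some_id = next(id_ for id_, pairs in matched_files.items() if pairs)
for arcname, src in matched_files[some_id][:3]:
    print(arcname, "<-", src)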
from collections import defaultdict
import os
import boto3
import botocore
import tarfile
import tempfile
from pathlib import Path
from botocore.config import Config
from botocore.exceptions import ClientError
#print("boto3 version:", boto3.__version__)
#print("botocore version:", botocore.__version__)
BUCKET_NAME = os.getenv("BUCKET_NAME")
BUCKET_PREFIX = os.getenv("BUCKET_PREFIX", "")
AWS_REGION = os.getenv("AWS_REGION", "")
AWS_ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
#print(BUCKET_NAME)
missing = [v for v in ["BUCKET_NAME", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] if not os.getenv(v)]
if missing:
    raise EnvironmentError(f"Missing required env vars: {', '.join(missing)}")
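For reference, pastis.env only needs to define the variables read above, plus BASE_DIR for the scanning step. The values below are placeholders, not real credentials or endpoints:
# pastis.env (placeholder values -- adjust to your own bucket and endpoint)
BASE_DIR=/path/to/PASTIS-HD
BUCKET_NAME=my-bucket
BUCKET_PREFIX=pastis
AWS_REGION=your-region
AWS_ENDPOINT_URL=https://s3.example.com
AWS_ACCESS_KEY_ID=your-access-key
AWS_SECRET_ACCESS_KEY=your-secret-key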
boto_config = Config(
    s3={'addressing_style': 'path'},
    retries={'max_attempts': 3},
    signature_version='s3v4'
)
s3 = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    endpoint_url=AWS_ENDPOINT_URL,
    region_name=AWS_REGION,
    config=boto_config
)
try:
    s3.head_bucket(Bucket=BUCKET_NAME)
    test_key = f"{BUCKET_PREFIX.rstrip('/')}/test.txt" if BUCKET_PREFIX else "test.txt"
    s3.put_object(Bucket=BUCKET_NAME, Key=test_key, Body=b"test")
    s3.delete_object(Bucket=BUCKET_NAME, Key=test_key)
    print(f"✅ Bucket '{BUCKET_NAME}' is accessible and writable.")
except Exception as e:
    raise RuntimeError(f"❌ Bucket check failed: {e}") from e
tile_to_paths = defaultdict(list)
for id_, path_pairs in matched_files.items():
    for arcname, path in path_pairs:
        tile = Path(arcname).parts[0]
        tile_to_paths[tile].append((arcname, path))
skipped = 0
uploaded = 0
for tile, file_list in tile_to_paths.items():
    if not file_list:
        continue
    s3_key = f"{BUCKET_PREFIX.rstrip('/')}/{tile}.tar" if BUCKET_PREFIX else f"{tile}.tar"
    try:
        s3.head_object(Bucket=BUCKET_NAME, Key=s3_key)
        print(f"⏭️ Skipping {tile}: already exists in S3.")
        skipped += 1
        continue
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            raise RuntimeError(f"❌ Error checking existence of {s3_key}: {e}")
    print(f"- {tile}: {len(file_list)} files")
    with tempfile.NamedTemporaryFile(suffix=".tar", delete=True) as tmp_tar:
        with tarfile.open(tmp_tar.name, "w") as tar:
            for arcname, path in file_list:
                tar.add(Path(path), arcname=arcname)
        s3.upload_file(tmp_tar.name, BUCKET_NAME, s3_key)
    url_display = f"{AWS_ENDPOINT_URL}/{BUCKET_NAME}/{s3_key}"
    uploaded += 1
    print(f"✅ Uploaded: {url_display} ({len(file_list)} files)")
print(f"\n📦 Tiles uploaded: {uploaded} (skipped: {skipped})")

✅ Bucket 'versioneer-papers' is accessible and writable.
⏭️ Skipping t30uxv: already exists in S3.
- t31tfj: 4984 files
✅ Uploaded: https://s3.de.io.cloud.ovh.net/versioneer-papers/pastis/t31tfj.tar (4984 files)
- t31tfm: 5784 files
✅ Uploaded: https://s3.de.io.cloud.ovh.net/versioneer-papers/pastis/t31tfm.tar (5784 files)
- t32ulu: 4448 files
✅ Uploaded: https://s3.de.io.cloud.ovh.net/versioneer-papers/pastis/t32ulu.tar (4448 files)
📦 Tiles uploaded: 3 (skipped: 1)
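To use one of the per-tile archives elsewhere, it can be fetched with the same credentials and unpacked locally. A minimal sketch, assuming the same environment variables are set and using t31tfj as the example tile:
# Download one per-tile archive and unpack it; members are laid out as TILE/ID/FILENAME.
import os
import tarfile
import boto3

s3 = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    endpoint_url=os.getenv("AWS_ENDPOINT_URL"),
)
key = "pastis/t31tfj.tar"  # i.e. f"{BUCKET_PREFIX}/{tile}.tar"
s3.download_file(os.getenv("BUCKET_NAME"), key, "t31tfj.tar")
with tarfile.open("t31tfj.tar") as tar:
    tar.extractall("pastis_tiles")  # creates pastis_tiles/t31tfj/<patch id>/<files>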