This notebook splits the original PASTIS dataset into 4 per-tile subsets, packages each tile into a tar archive, and uploads the archives individually to an S3 bucket.
- dataset origin: https://github.com/VSainteuf/pastis-benchmark
- data source: https://www.eotdl.com/datasets/PASTIS-HD
#pip install boto3==1.35.95 botocore==1.35.95 python-dotenv
from dotenv import load_dotenv
load_dotenv("pastis.env")
True
import os
import json
import numpy as np
import xarray as xr
from pathlib import Path
from typing import Dict, List, Optional, Tuple

def load_geojson_ids_and_tiles(path: Path, max_items: Optional[int] = None) -> Dict[str, str]:
    """Map each patch id in metadata_pastis.geojson to its TILE property."""
    with open(path, "r", encoding="utf-8") as f:
        geojson = json.load(f)
    features = geojson.get("features", [])
    if max_items is not None:
        features = features[:max_items]
    id_to_tile = {}
    for feature in features:
        id_ = feature["id"]
        tile = feature.get("properties", {}).get("TILE", "UNKNOWN")
        id_to_tile[id_] = tile
    return id_to_tile
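The loader only relies on two fields per GeoJSON feature: the top-level id and properties.TILE. As a quick illustration (the id and tile values below are made up, not taken from the real metadata), a minimal file maps straight to an id→tile dictionary:
# Illustrative only: a tiny stand-in for metadata_pastis.geojson with fabricated values.
import json
import tempfile
from pathlib import Path
demo = {
    "type": "FeatureCollection",
    "features": [
        {"id": "10000", "properties": {"TILE": "t31tfj"}, "geometry": None},
        {"id": "20001", "properties": {"TILE": "t30uxv"}, "geometry": None},
    ],
}
with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as f:
    json.dump(demo, f)
print(load_geojson_ids_and_tiles(Path(f.name)))  # {'10000': 't31tfj', '20001': 't30uxv'}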
def find_files_with_ids_and_tiles(base_path: Path, id_to_tile: Dict[str, str]) -> Dict[str, List[Tuple[str, Path]]]:
    """Walk base_path and collect, per patch id, (arcname, full_path) pairs with arcname = TILE/ID/FILENAME."""
    results = {id_: [] for id_ in id_to_tile}
    for root, _, files in os.walk(base_path):
        for file in files:
            # Skip auxiliary files and zone masks
            if 'aux' in file.lower() or file.lower().startswith('zones_'):
                continue
            for id_ in id_to_tile:
                if id_ in file:
                    full_path = Path(root) / file
                    rel_path = full_path.relative_to(base_path)
                    tile = id_to_tile[id_]
                    arcname = str(Path(tile) / id_ / rel_path.name)
                    results[id_].append((arcname, full_path))
    return results
BASE_DIR = Path(os.getenv("BASE_DIR"))
id_to_tile = load_geojson_ids_and_tiles(BASE_DIR / "metadata_pastis.geojson")  # optionally pass max_items; 2433 patches in total
matched_files = find_files_with_ids_and_tiles(BASE_DIR, id_to_tile)

from collections import defaultdict
tile_counts = defaultdict(int)
for id_, path_pairs in matched_files.items():
    if not path_pairs:
        continue
    # Extract tile name from the arcname (assumes structure TILE/id_/filename)
    first_arcname = path_pairs[0][0]
    tile = Path(first_arcname).parts[0]
    tile_counts[tile] += 1

print("\n📦 Patch count per TILE:")
for tile, count in sorted(tile_counts.items()):
    print(f" - {tile}: {count} patches")
print(f"\n🔢 Total unique tiles: {len(tile_counts)}")
📦 Patch count per TILE:
- t30uxv: 531 patches
- t31tfj: 623 patches
- t31tfm: 723 patches
- t32ulu: 556 patches
🔢 Total unique tiles: 4
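Before packaging, it is worth spot-checking that the arcnames really follow the TILE/ID/FILENAME layout the tar members are supposed to have. A small sketch (the exact ids and filenames printed depend on your local PASTIS-HD copy):
# Sanity check: print the first few (arcname, source path) pairs of one matched patch.
some_id = next(id_ for id_, pairs in matched_files.items() if pairs)
for arcname, src in matched_files[some_id][:3]:
    print(arcname, "<-", src)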
from collections import defaultdict
import os
import boto3
import botocore
import tarfile
import tempfile
from pathlib import Path
from botocore.config import Config
from botocore.exceptions import ClientError
#print("boto3 version:", boto3.__version__)
#print("botocore version:", botocore.__version__)
BUCKET_NAME = os.getenv("BUCKET_NAME")
BUCKET_PREFIX = os.getenv("BUCKET_PREFIX", "")
AWS_REGION = os.getenv("AWS_REGION", "")
AWS_ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
#print(BUCKET_NAME)
missing = [v for v in ["BUCKET_NAME", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] if not os.getenv(v)]
if missing:
    raise EnvironmentError(f"Missing required env vars: {', '.join(missing)}")
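For reference, pastis.env only needs to define the variables read above, plus BASE_DIR for the scanning step. The values below are placeholders, not real credentials or endpoints:
# pastis.env (placeholder values -- adjust to your own bucket and endpoint)
BASE_DIR=/path/to/PASTIS-HD
BUCKET_NAME=my-bucket
BUCKET_PREFIX=pastis
AWS_REGION=your-region
AWS_ENDPOINT_URL=https://s3.example.com
AWS_ACCESS_KEY_ID=your-access-key
AWS_SECRET_ACCESS_KEY=your-secret-key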
boto_config = Config(
    s3={'addressing_style': 'path'},
    retries={'max_attempts': 3},
    signature_version='s3v4'
)
s3 = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    endpoint_url=AWS_ENDPOINT_URL,
    region_name=AWS_REGION,
    config=boto_config
)
try:
    s3.head_bucket(Bucket=BUCKET_NAME)
    test_key = f"{BUCKET_PREFIX.rstrip('/')}/test.txt" if BUCKET_PREFIX else "test.txt"
    s3.put_object(Bucket=BUCKET_NAME, Key=test_key, Body=b"test")
    s3.delete_object(Bucket=BUCKET_NAME, Key=test_key)
    print(f"✅ Bucket '{BUCKET_NAME}' is accessible and writable.")
except Exception as e:
    raise RuntimeError(f"❌ Bucket check failed: {e}") from e
tile_to_paths = defaultdict(list)
for id_, path_pairs in matched_files.items():
    for arcname, path in path_pairs:
        tile = Path(arcname).parts[0]
        tile_to_paths[tile].append((arcname, path))
skipped = 0
uploaded = 0
for tile, file_list in tile_to_paths.items():
    if not file_list:
        continue
    s3_key = f"{BUCKET_PREFIX.rstrip('/')}/{tile}.tar" if BUCKET_PREFIX else f"{tile}.tar"
    try:
        s3.head_object(Bucket=BUCKET_NAME, Key=s3_key)
        print(f"⏭️ Skipping {tile}: already exists in S3.")
        skipped += 1
        continue
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            raise RuntimeError(f"❌ Error checking existence of {s3_key}: {e}")
    print(f"- {tile}: {len(file_list)} files")
    with tempfile.NamedTemporaryFile(suffix=".tar", delete=True) as tmp_tar:
        with tarfile.open(tmp_tar.name, "w") as tar:
            for arcname, path in file_list:
                tar.add(Path(path), arcname=arcname)
        s3.upload_file(tmp_tar.name, BUCKET_NAME, s3_key)
    url_display = f"{AWS_ENDPOINT_URL}/{BUCKET_NAME}/{s3_key}"
    uploaded += 1
    print(f"✅ Uploaded: {url_display} ({len(file_list)} files)")
print(f"\n📦 Tiles uploaded: {uploaded} (skipped: {skipped})")

✅ Bucket 'versioneer-papers' is accessible and writable.
⏭️ Skipping t30uxv: already exists in S3.
- t31tfj: 4984 files
✅ Uploaded: https://s3.de.io.cloud.ovh.net/versioneer-papers/pastis/t31tfj.tar (4984 files)
- t31tfm: 5784 files
✅ Uploaded: https://s3.de.io.cloud.ovh.net/versioneer-papers/pastis/t31tfm.tar (5784 files)
- t32ulu: 4448 files
✅ Uploaded: https://s3.de.io.cloud.ovh.net/versioneer-papers/pastis/t32ulu.tar (4448 files)
📦 Tiles uploaded: 3 (skipped: 1)
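To use one of the per-tile archives elsewhere, it can be fetched with the same credentials and unpacked locally. A minimal sketch, assuming the same environment variables are set and using t31tfj as the example tile:
# Download one per-tile archive and unpack it; members are laid out as TILE/ID/FILENAME.
import os
import tarfile
import boto3

s3 = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    endpoint_url=os.getenv("AWS_ENDPOINT_URL"),
)
key = "pastis/t31tfj.tar"  # i.e. f"{BUCKET_PREFIX}/{tile}.tar"
s3.download_file(os.getenv("BUCKET_NAME"), key, "t31tfj.tar")
with tarfile.open("t31tfj.tar") as tar:
    tar.extractall("pastis_tiles")  # creates pastis_tiles/t31tfj/<patch id>/<files>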