Initial commit

This commit is contained in:
2025-08-28 22:38:53 +02:00
commit f15208fe6d
232 changed files with 16821 additions and 0 deletions

33
vogelwarte/pom.xml Normal file
View File

@@ -0,0 +1,33 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor of the fro-vogelwarte module, which contains the vogelwarte.ch fetcher. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- groupId and version are inherited from the parent project. -->
<parent>
<groupId>ch.gtache.fro</groupId>
<artifactId>fro</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<artifactId>fro-vogelwarte</artifactId>
<!-- No dependency declares a version here; presumably they are managed by the parent POM (TODO confirm). -->
<dependencies>
<!-- Sibling module of the same project (same groupId). -->
<dependency>
<groupId>ch.gtache.fro</groupId>
<artifactId>fro-selenium</artifactId>
</dependency>
<!-- Dependency injection. -->
<dependency>
<groupId>com.google.dagger</groupId>
<artifactId>dagger</artifactId>
</dependency>
<!-- Logging API used by the main code. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
</dependency>
<!-- Logging implementation, restricted to the test scope. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@@ -0,0 +1,15 @@
package ch.gtache.fro.modules.vogelwarte;
import ch.gtache.fro.Fetcher;
import ch.gtache.fro.vogelwarte.VogelwarteFetcher;
import dagger.Binds;
import dagger.Module;
import dagger.multibindings.IntoSet;
/**
 * Dagger module for the vogelwarte.ch fetcher.
 * <p>
 * Adds {@link VogelwarteFetcher} to the multibound set of {@link Fetcher}.
 */
@Module
public abstract class VogelwarteModule {
// @Binds + @IntoSet: the given VogelwarteFetcher is contributed as one element of the Fetcher set.
@Binds
@IntoSet
abstract Fetcher bindsFetcher(VogelwarteFetcher fetcher);
}

View File

@@ -0,0 +1,188 @@
package ch.gtache.fro.vogelwarte;
import ch.gtache.fro.Bird;
import ch.gtache.fro.BirdProvider;
import ch.gtache.fro.Configuration;
import ch.gtache.fro.FetchException;
import ch.gtache.fro.Fetcher;
import ch.gtache.fro.PictureType;
import ch.gtache.fro.SoundType;
import ch.gtache.fro.selenium.AbstractSeleniumFetcher;
import jakarta.inject.Inject;
import jakarta.inject.Singleton;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.HttpStatusException;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.openqa.selenium.By;
import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.SequencedCollection;
import java.util.Set;
import java.util.regex.Pattern;
/**
* Implementation of {@link Fetcher} for vogelwarte.ch
*/
@Singleton
public class VogelwarteFetcher extends AbstractSeleniumFetcher {
private static final Logger logger = LogManager.getLogger(VogelwarteFetcher.class);
private static final String BASE_URL = "https://www.vogelwarte.ch";
private static final Pattern IMAGE_PATTERN = Pattern.compile("https://www\\.vogelwarte\\.ch/wp-content/(?:assets/images/bird/species|uploads/\\d{4}/\\d{2})/(?<id>\\d+(?:_\\d+)?)\\.jpg");
@Inject
VogelwarteFetcher(final BirdProvider birdProvider, final Configuration configuration) {
super(birdProvider, configuration);
}
@Override
protected void waitFor() throws IOException {
while (driver().findElements(By.cssSelector("main img")).isEmpty()) {
try {
Thread.sleep(100L);
} catch (final InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException(e);
}
}
try {
Thread.sleep(1000L);
} catch (final InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException(e);
}
}
@Override
public String name() {
return "Vogelwarte";
}
private static String getSpecialBirdName(final String birdName) {
return switch (birdName) {
case "mesange-boreale" -> "mesange-boreale-alpestre-ou-des-saules";
default -> birdName;
};
}
@Override
protected void download(final Bird bird) throws FetchException {
final var birdName = getSpecialBirdName(bird.name().toLowerCase(Locale.ROOT).replace("_", "-"));
final var url = BASE_URL + "/fr/les-oiseaux-de-suisse/" + birdName;
logger.info("Trying to download {}", url);
try {
final var document = getDocument(url);
final var seen = new HashSet<String>();
final var ogImages = document.select("meta[property=og:image]");
final var id = getId(ogImages);
saveImages(bird, document, id, seen);
savePreloadImages(bird, document, id, seen);
saveOGImages(bird, document, id, seen);
downloadSound(bird, id);
} catch (final IOException e) {
throw new FetchException("Failed to parse and download " + url, e);
}
}
private void saveImages(final Bird bird, final Document document, final String id,
final Set<? super String> seen) throws IOException {
final var images = document.select("main img");
for (final var image : images) {
saveImage(bird, image, id, seen);
}
}
private void saveImage(final Bird bird, final Element image, final String id,
final Set<? super String> seen) throws IOException {
final var src = image.attr("src");
final var imageLocation = BASE_URL + src;
final var matcher = IMAGE_PATTERN.matcher(imageLocation);
if (matcher.matches()) {
final var imageId = matcher.group("id");
if (imageId.startsWith(id) && seen.add(imageId)) {
final var alt = image.attr("alt");
final var pictureType = alt.startsWith("sous-espèce") ? PictureType.SUB_SPECIES : switch (alt) {
case "adulte" -> PictureType.ADULT;
case "plumage juvénile" -> PictureType.JUVENILE;
case "plumage nuptial" -> PictureType.ADULT_NUPTIAL;
case "plumage internuptial" -> PictureType.ADULT_INTERNUPTIAL;
case "mâle" -> PictureType.ADULT_MALE;
case "mâle nuptial" -> PictureType.ADULT_NUPTIAL_MALE;
case "mâle internuptial" -> PictureType.ADULT_INTERNUPTIAL_MALE;
case "femelle" -> PictureType.ADULT_FEMALE;
case "forme claire" -> PictureType.LIGHT_MORPH;
case "forme sombre" -> PictureType.DARK_MORPH;
default -> PictureType.UNKNOWN;
};
downloadImage(bird, imageLocation, pictureType);
}
}
}
private void saveOGImages(final Bird bird, final Document document, final String id,
final Set<? super String> seen) throws IOException {
final var ogImages = document.select("meta[property=og:image]");
for (final var ogImage : ogImages) {
saveOGImage(bird, ogImage, id, seen);
}
}
private void savePreloadImages(final Bird bird, final Document document, final String id,
final Set<? super String> seen) throws IOException {
final var preloadImages = document.select("link[rel=preload]");
for (final var preloadImage : preloadImages) {
savePreloadImage(bird, preloadImage, id, seen);
}
}
private void savePreloadImage(final Bird bird, final Element preloadImage, final String id,
final Set<? super String> seen) throws IOException {
final var href = preloadImage.attr("href");
final var matcher = IMAGE_PATTERN.matcher(href);
if (matcher.matches()) {
final var imageId = matcher.group("id");
if (imageId.startsWith(id) && seen.add(imageId)) {
downloadImage(bird, href, PictureType.UNKNOWN);
}
}
}
private void saveOGImage(final Bird bird, final Element ogImage, final String id,
final Set<? super String> seen) throws IOException {
final var src = ogImage.attr("content");
final var matcher = IMAGE_PATTERN.matcher(src);
if (matcher.matches()) {
final var imageId = matcher.group("id");
if (imageId.startsWith(id) && seen.add(imageId)) {
downloadImage(bird, src, PictureType.UNKNOWN);
}
}
}
private void downloadSound(final Bird bird, final CharSequence id) throws IOException {
final var url = BASE_URL + "/wp-content/assets/media/voices/" + id + ".mp3";
logger.info("Trying to download {}", url);
try {
downloadSound(bird, url, SoundType.SONG);
} catch (final HttpStatusException _) {
final var fallbackId = "0".repeat(4 - id.length()) + id;
if (!fallbackId.contentEquals(id)) {
downloadSound(bird, fallbackId);
}
}
}
private static String getId(final SequencedCollection<? extends Element> images) {
final var image = images.getFirst();
final var src = image.attr("content");
final var split = src.split("/");
final var lastSegment = split[split.length - 1];
final var fullId = lastSegment.split("\\.")[0];
return fullId.split("_")[0];
}
}

View File

@@ -0,0 +1,14 @@
/**
 * Module for vogelwarte.ch fetcher
 */
module ch.gtache.fro.vogelwarte {
    requires transitive ch.gtache.fro.selenium;
    requires org.jsoup;
    requires java.net.http;
    requires jakarta.inject;
    requires org.apache.logging.log4j;
    requires dagger;
    requires java.compiler;
    requires org.seleniumhq.selenium.api;

    exports ch.gtache.fro.vogelwarte;
    // The public Dagger module class lives in this package; without the export, other modules cannot reference it.
    // NOTE(review): assumes the consuming component is in another JPMS module - confirm.
    exports ch.gtache.fro.modules.vogelwarte;
}

View File

@@ -0,0 +1,8 @@
package ch.gtache.fro.vogelwarte;
/**
 * Placeholder for the tests of the vogelwarte.ch fetcher; it does not contain any test yet.
 */
final class TestVogelwarteFetcher {
// Private constructor: the class cannot be instantiated from outside.
private TestVogelwarteFetcher() {
}
}