Initial commit
This commit is contained in:
33
vogelwarte/pom.xml
Normal file
33
vogelwarte/pom.xml
Normal file
@@ -0,0 +1,33 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor of the fro-vogelwarte module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- groupId and version of this module are inherited from the parent POM. -->
    <parent>
        <groupId>ch.gtache.fro</groupId>
        <artifactId>fro</artifactId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>

    <artifactId>fro-vogelwarte</artifactId>

    <!--
        No dependency declares a version here.
        NOTE(review): presumably they are managed by the parent's dependencyManagement; confirm.
    -->
    <dependencies>
        <!-- Project-internal dependency -->
        <dependency>
            <groupId>ch.gtache.fro</groupId>
            <artifactId>fro-selenium</artifactId>
        </dependency>

        <!-- Dependency injection -->
        <dependency>
            <groupId>com.google.dagger</groupId>
            <artifactId>dagger</artifactId>
        </dependency>

        <!-- Logging: the API is available at compile time, the core implementation only in tests -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
|
||||
@@ -0,0 +1,15 @@
|
||||
package ch.gtache.fro.modules.vogelwarte;
|
||||
|
||||
import ch.gtache.fro.Fetcher;
|
||||
import ch.gtache.fro.vogelwarte.VogelwarteFetcher;
|
||||
import dagger.Binds;
|
||||
import dagger.Module;
|
||||
import dagger.multibindings.IntoSet;
|
||||
|
||||
@Module
|
||||
public abstract class VogelwarteModule {
|
||||
|
||||
@Binds
|
||||
@IntoSet
|
||||
abstract Fetcher bindsFetcher(VogelwarteFetcher fetcher);
|
||||
}
|
||||
@@ -0,0 +1,188 @@
|
||||
package ch.gtache.fro.vogelwarte;
|
||||
|
||||
import ch.gtache.fro.Bird;
|
||||
import ch.gtache.fro.BirdProvider;
|
||||
import ch.gtache.fro.Configuration;
|
||||
import ch.gtache.fro.FetchException;
|
||||
import ch.gtache.fro.Fetcher;
|
||||
import ch.gtache.fro.PictureType;
|
||||
import ch.gtache.fro.SoundType;
|
||||
import ch.gtache.fro.selenium.AbstractSeleniumFetcher;
|
||||
import jakarta.inject.Inject;
|
||||
import jakarta.inject.Singleton;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.jsoup.HttpStatusException;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.openqa.selenium.By;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.SequencedCollection;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Implementation of {@link Fetcher} for vogelwarte.ch
|
||||
*/
|
||||
@Singleton
|
||||
public class VogelwarteFetcher extends AbstractSeleniumFetcher {
|
||||
private static final Logger logger = LogManager.getLogger(VogelwarteFetcher.class);
|
||||
|
||||
private static final String BASE_URL = "https://www.vogelwarte.ch";
|
||||
private static final Pattern IMAGE_PATTERN = Pattern.compile("https://www\\.vogelwarte\\.ch/wp-content/(?:assets/images/bird/species|uploads/\\d{4}/\\d{2})/(?<id>\\d+(?:_\\d+)?)\\.jpg");
|
||||
|
||||
/**
 * Creates the fetcher. Both arguments are handed over unchanged to the
 * {@link AbstractSeleniumFetcher} constructor.
 *
 * @param birdProvider  The bird provider
 * @param configuration The configuration
 */
@Inject
VogelwarteFetcher(final BirdProvider birdProvider,
                  final Configuration configuration) {
    super(birdProvider, configuration);
}
|
||||
|
||||
@Override
|
||||
protected void waitFor() throws IOException {
|
||||
while (driver().findElements(By.cssSelector("main img")).isEmpty()) {
|
||||
try {
|
||||
Thread.sleep(100L);
|
||||
} catch (final InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new IOException(e);
|
||||
}
|
||||
}
|
||||
try {
|
||||
Thread.sleep(1000L);
|
||||
} catch (final InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new IOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String name() {
|
||||
return "Vogelwarte";
|
||||
}
|
||||
|
||||
private static String getSpecialBirdName(final String birdName) {
|
||||
return switch (birdName) {
|
||||
case "mesange-boreale" -> "mesange-boreale-alpestre-ou-des-saules";
|
||||
default -> birdName;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void download(final Bird bird) throws FetchException {
|
||||
final var birdName = getSpecialBirdName(bird.name().toLowerCase(Locale.ROOT).replace("_", "-"));
|
||||
final var url = BASE_URL + "/fr/les-oiseaux-de-suisse/" + birdName;
|
||||
|
||||
logger.info("Trying to download {}", url);
|
||||
try {
|
||||
final var document = getDocument(url);
|
||||
final var seen = new HashSet<String>();
|
||||
final var ogImages = document.select("meta[property=og:image]");
|
||||
final var id = getId(ogImages);
|
||||
saveImages(bird, document, id, seen);
|
||||
savePreloadImages(bird, document, id, seen);
|
||||
saveOGImages(bird, document, id, seen);
|
||||
downloadSound(bird, id);
|
||||
} catch (final IOException e) {
|
||||
throw new FetchException("Failed to parse and download " + url, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void saveImages(final Bird bird, final Document document, final String id,
|
||||
final Set<? super String> seen) throws IOException {
|
||||
final var images = document.select("main img");
|
||||
for (final var image : images) {
|
||||
saveImage(bird, image, id, seen);
|
||||
}
|
||||
}
|
||||
|
||||
private void saveImage(final Bird bird, final Element image, final String id,
|
||||
final Set<? super String> seen) throws IOException {
|
||||
final var src = image.attr("src");
|
||||
final var imageLocation = BASE_URL + src;
|
||||
final var matcher = IMAGE_PATTERN.matcher(imageLocation);
|
||||
if (matcher.matches()) {
|
||||
final var imageId = matcher.group("id");
|
||||
if (imageId.startsWith(id) && seen.add(imageId)) {
|
||||
final var alt = image.attr("alt");
|
||||
final var pictureType = alt.startsWith("sous-espèce") ? PictureType.SUB_SPECIES : switch (alt) {
|
||||
case "adulte" -> PictureType.ADULT;
|
||||
case "plumage juvénile" -> PictureType.JUVENILE;
|
||||
case "plumage nuptial" -> PictureType.ADULT_NUPTIAL;
|
||||
case "plumage internuptial" -> PictureType.ADULT_INTERNUPTIAL;
|
||||
case "mâle" -> PictureType.ADULT_MALE;
|
||||
case "mâle nuptial" -> PictureType.ADULT_NUPTIAL_MALE;
|
||||
case "mâle internuptial" -> PictureType.ADULT_INTERNUPTIAL_MALE;
|
||||
case "femelle" -> PictureType.ADULT_FEMALE;
|
||||
case "forme claire" -> PictureType.LIGHT_MORPH;
|
||||
case "forme sombre" -> PictureType.DARK_MORPH;
|
||||
default -> PictureType.UNKNOWN;
|
||||
};
|
||||
downloadImage(bird, imageLocation, pictureType);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void saveOGImages(final Bird bird, final Document document, final String id,
|
||||
final Set<? super String> seen) throws IOException {
|
||||
final var ogImages = document.select("meta[property=og:image]");
|
||||
for (final var ogImage : ogImages) {
|
||||
saveOGImage(bird, ogImage, id, seen);
|
||||
}
|
||||
}
|
||||
|
||||
private void savePreloadImages(final Bird bird, final Document document, final String id,
|
||||
final Set<? super String> seen) throws IOException {
|
||||
final var preloadImages = document.select("link[rel=preload]");
|
||||
for (final var preloadImage : preloadImages) {
|
||||
savePreloadImage(bird, preloadImage, id, seen);
|
||||
}
|
||||
}
|
||||
|
||||
private void savePreloadImage(final Bird bird, final Element preloadImage, final String id,
|
||||
final Set<? super String> seen) throws IOException {
|
||||
final var href = preloadImage.attr("href");
|
||||
final var matcher = IMAGE_PATTERN.matcher(href);
|
||||
if (matcher.matches()) {
|
||||
final var imageId = matcher.group("id");
|
||||
if (imageId.startsWith(id) && seen.add(imageId)) {
|
||||
downloadImage(bird, href, PictureType.UNKNOWN);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void saveOGImage(final Bird bird, final Element ogImage, final String id,
|
||||
final Set<? super String> seen) throws IOException {
|
||||
final var src = ogImage.attr("content");
|
||||
final var matcher = IMAGE_PATTERN.matcher(src);
|
||||
if (matcher.matches()) {
|
||||
final var imageId = matcher.group("id");
|
||||
if (imageId.startsWith(id) && seen.add(imageId)) {
|
||||
downloadImage(bird, src, PictureType.UNKNOWN);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void downloadSound(final Bird bird, final CharSequence id) throws IOException {
|
||||
final var url = BASE_URL + "/wp-content/assets/media/voices/" + id + ".mp3";
|
||||
logger.info("Trying to download {}", url);
|
||||
try {
|
||||
downloadSound(bird, url, SoundType.SONG);
|
||||
} catch (final HttpStatusException _) {
|
||||
final var fallbackId = "0".repeat(4 - id.length()) + id;
|
||||
if (!fallbackId.contentEquals(id)) {
|
||||
downloadSound(bird, fallbackId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static String getId(final SequencedCollection<? extends Element> images) {
|
||||
final var image = images.getFirst();
|
||||
final var src = image.attr("content");
|
||||
final var split = src.split("/");
|
||||
final var lastSegment = split[split.length - 1];
|
||||
final var fullId = lastSegment.split("\\.")[0];
|
||||
return fullId.split("_")[0];
|
||||
}
|
||||
}
|
||||
14
vogelwarte/src/main/java/module-info.java
Normal file
14
vogelwarte/src/main/java/module-info.java
Normal file
@@ -0,0 +1,14 @@
|
||||
/**
|
||||
* Module for vogelwarte.ch fetcher
|
||||
*/
|
||||
module ch.gtache.fro.vogelwarte {
|
||||
requires transitive ch.gtache.fro.selenium;
|
||||
requires org.jsoup;
|
||||
requires java.net.http;
|
||||
requires jakarta.inject;
|
||||
requires org.apache.logging.log4j;
|
||||
requires dagger;
|
||||
requires java.compiler;
|
||||
requires org.seleniumhq.selenium.api;
|
||||
exports ch.gtache.fro.vogelwarte;
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
package ch.gtache.fro.vogelwarte;
|
||||
|
||||
/**
 * Placeholder for the tests of {@code VogelwarteFetcher}; it does not contain any test yet.
 */
final class TestVogelwarteFetcher {

    /**
     * Private constructor: the class cannot be instantiated from outside.
     */
    private TestVogelwarteFetcher() {
    }
}
|
||||
Reference in New Issue
Block a user