Initial commit
This commit is contained in:
24
oiseaux-net/pom.xml
Normal file
24
oiseaux-net/pom.xml
Normal file
@@ -0,0 +1,24 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>ch.gtache.fro</groupId>
|
||||
<artifactId>fro</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>fro-oiseaux-net</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>ch.gtache.fro</groupId>
|
||||
<artifactId>fro-selenium</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.dagger</groupId>
|
||||
<artifactId>dagger</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -0,0 +1,76 @@
|
||||
package ch.gtache.fro.oiseaux.net;
|
||||
|
||||
import ch.gtache.fro.impl.CommonBirds;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Arrays;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static ch.gtache.fro.oiseaux.net.Utils.getName;
|
||||
|
||||
/**
|
||||
* Fetches the translations for all bird from oiseaux.net
|
||||
*/
|
||||
final class BirdTranslationsFetcher {
|
||||
|
||||
private static final Pattern END_COMMA_PATTERN = Pattern.compile("\\s*,\\s*$");
|
||||
|
||||
private BirdTranslationsFetcher() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Main method
|
||||
*
|
||||
* @param args command line arguments
|
||||
*/
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final var options = new ChromeOptions();
|
||||
options.addArguments("--headless=new");
|
||||
final var driver = new ChromeDriver(options);
|
||||
for (final var value : CommonBirds.values()) {
|
||||
final var name = getName(value);
|
||||
final var url = "https://www.oiseaux.net/oiseaux/" + name + ".html";
|
||||
driver.get(url);
|
||||
final var html = (String) driver.executeScript("return document.documentElement.outerHTML");
|
||||
final var document = Jsoup.parse(html);
|
||||
final var title = document.select("span[itemprop=headline]");
|
||||
final var elements = document.select("span.flag");
|
||||
if (title.isEmpty() || elements.isEmpty()) {
|
||||
System.out.println(value + " NOT FOUND!");
|
||||
} else {
|
||||
createOrAppend("fr", value.name(), title.text());
|
||||
for (final var element : elements) {
|
||||
final var clazz = element.attr("class");
|
||||
final var split = clazz.split(" ");
|
||||
final var language = Arrays.stream(split).filter(s -> s.startsWith("flag-")).map(s -> s.split("-")[1]).findFirst().orElse(null);
|
||||
if (language == null) {
|
||||
System.out.println("Language not found in " + element);
|
||||
} else {
|
||||
final var next = element.nextElementSibling();
|
||||
final var text = END_COMMA_PATTERN.matcher(next.text()).replaceAll("");
|
||||
createOrAppend(language, value.name(), text);
|
||||
}
|
||||
}
|
||||
}
|
||||
Thread.sleep(1000);
|
||||
}
|
||||
}
|
||||
|
||||
private static void createOrAppend(final String language, final String birdName, final String translation) throws IOException {
|
||||
final var file = Path.of("BirdBundle_" + language + ".properties");
|
||||
final var toWrite = "bird." + birdName + ".label=" + translation + "\n";
|
||||
if (Files.exists(file)) {
|
||||
Files.writeString(file, toWrite, StandardCharsets.UTF_8, StandardOpenOption.APPEND);
|
||||
} else {
|
||||
Files.writeString(file, toWrite, StandardCharsets.UTF_8, StandardOpenOption.CREATE);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,217 @@
|
||||
package ch.gtache.fro.oiseaux.net;
|
||||
|
||||
import ch.gtache.fro.Bird;
|
||||
import ch.gtache.fro.BirdProvider;
|
||||
import ch.gtache.fro.Configuration;
|
||||
import ch.gtache.fro.FetchException;
|
||||
import ch.gtache.fro.Fetcher;
|
||||
import ch.gtache.fro.PictureType;
|
||||
import ch.gtache.fro.SoundType;
|
||||
import ch.gtache.fro.impl.CommonBirds;
|
||||
import ch.gtache.fro.selenium.AbstractSeleniumFetcher;
|
||||
import jakarta.inject.Inject;
|
||||
import jakarta.inject.Singleton;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.jsoup.HttpStatusException;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Implementation of {@link Fetcher} for oiseaux.net
|
||||
*/
|
||||
@Singleton
|
||||
public class OiseauxNetFetcher extends AbstractSeleniumFetcher {
|
||||
private static final Logger logger = LogManager.getLogger(OiseauxNetFetcher.class);
|
||||
|
||||
private static final Pattern SOUND_PATTERN = Pattern.compile("https://www.xeno-canto.org/(?<id>\\d+)/download");
|
||||
|
||||
|
||||
/**
|
||||
* Instantiates the fetcher
|
||||
*
|
||||
* @param birdProvider The bird provider
|
||||
* @param configuration The configuration
|
||||
* @throws NullPointerException If any parameter is null
|
||||
*/
|
||||
@Inject
|
||||
OiseauxNetFetcher(final BirdProvider birdProvider, final Configuration configuration) {
|
||||
super(birdProvider, configuration);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void waitFor() throws IOException {
|
||||
try {
|
||||
Thread.sleep(500L);
|
||||
} catch (final InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new IOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String name() {
|
||||
return "oiseaux.net";
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void download(final Bird bird) throws FetchException {
|
||||
if (bird != CommonBirds.MESANGE_ALPESTRE && bird != CommonBirds.MESANGE_DES_SAULES) {
|
||||
downloadImages(bird);
|
||||
downloadSounds(bird);
|
||||
}
|
||||
}
|
||||
|
||||
private void downloadImages(final Bird bird) throws FetchException {
|
||||
final var url = getImagesURL(bird);
|
||||
logger.info("Trying to download {}", url);
|
||||
try {
|
||||
final var document = getDocument(url);
|
||||
final var figures = document.select("figure");
|
||||
for (final var figure : figures) {
|
||||
handleFigure(bird, url, figure);
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
throw new FetchException("Failed to parse and download " + url, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void downloadSounds(final Bird bird) throws FetchException {
|
||||
try {
|
||||
final var jsUrl = getJSUrl(bird);
|
||||
final var content = Jsoup.connect(jsUrl).ignoreContentType(true).execute().body();
|
||||
final var url = getURL(bird);
|
||||
final var pageContent = getDocument(url);
|
||||
final var soundTypes = getSoundTypes(pageContent);
|
||||
final var matcher = SOUND_PATTERN.matcher(content);
|
||||
var i = 0;
|
||||
while (matcher.find()) {
|
||||
final var soundUrl = matcher.group();
|
||||
final var soundType = i >= soundTypes.size() ? SoundType.UNKNOWN : soundTypes.get(i);
|
||||
try {
|
||||
downloadSound(bird, soundUrl, soundType);
|
||||
} catch (final HttpStatusException e) {
|
||||
if (e.getStatusCode() == 404) {
|
||||
logger.warn("Sound {} not found", soundUrl);
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
if (i != soundTypes.size()) {
|
||||
logger.warn("Number of sounds ({}) does not match number of sound types ({})", i, soundTypes.size());
|
||||
}
|
||||
} catch (final HttpStatusException e) {
|
||||
if (e.getStatusCode() == 404) {
|
||||
logger.warn("Sound for bird {} not found", bird);
|
||||
} else {
|
||||
throw new FetchException(e);
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
throw new FetchException("Failed to parse and download " + getJSUrl(bird), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getSoundFilename(final String url) {
|
||||
final var matcher = SOUND_PATTERN.matcher(url);
|
||||
if (matcher.find()) {
|
||||
return matcher.group("id") + ".mp3";
|
||||
}
|
||||
return super.getSoundFilename(url);
|
||||
}
|
||||
|
||||
private static List<SoundType> getSoundTypes(final Document pageContent) {
|
||||
final var sounds = pageContent.select("a[class=jp-playlist-item],a[class=jp-playlist-item jp-playlist-current]");
|
||||
final var soundTypes = new ArrayList<SoundType>();
|
||||
for (final var sound : sounds) {
|
||||
if (!sound.text().isBlank()) {
|
||||
final var soundType = getSoundType(sound.text());
|
||||
soundTypes.add(soundType);
|
||||
}
|
||||
}
|
||||
return soundTypes;
|
||||
}
|
||||
|
||||
private static String getURL(final Bird bird) {
|
||||
final var name = Utils.getName(bird);
|
||||
return "https://www.oiseaux.net/oiseaux/" + name + ".html";
|
||||
}
|
||||
|
||||
private static String getJSUrl(final Bird bird) {
|
||||
final var name = Utils.getName(bird);
|
||||
return "https://www.oiseaux.net/front/js/espece/" + name + ".js";
|
||||
}
|
||||
|
||||
private void handleFigure(final Bird bird, final String url, final Element figure) throws IOException {
|
||||
final var link = figure.selectFirst("a");
|
||||
if (link != null) {
|
||||
final var img = link.selectFirst("img");
|
||||
if (img != null) {
|
||||
final var type = getPictureType(figure);
|
||||
downloadImage(bird, url, img, type);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void downloadImage(final Bird bird, final String url, final Element img, final PictureType pictureType) throws IOException {
|
||||
final var src = img.attr("data-src");
|
||||
final var lastSlash = url.lastIndexOf('/');
|
||||
final var imageUrl = url.substring(0, lastSlash) + addLeadingSlash(src).replace("/id/", "/");
|
||||
downloadImage(bird, imageUrl, pictureType);
|
||||
}
|
||||
|
||||
private static String addLeadingSlash(final String src) {
|
||||
if (src.startsWith("/")) {
|
||||
return src;
|
||||
} else {
|
||||
return "/" + src;
|
||||
}
|
||||
}
|
||||
|
||||
private static SoundType getSoundType(final String name) {
|
||||
return switch (name.trim().toLowerCase(Locale.ROOT)) {
|
||||
case "♫ chant" -> SoundType.SONG;
|
||||
case "♫ cri" -> SoundType.CALL;
|
||||
case "♫ cri en vol" -> SoundType.FLY_CALL;
|
||||
default -> SoundType.UNKNOWN;
|
||||
};
|
||||
}
|
||||
|
||||
private static PictureType getPictureType(final Element figure) {
|
||||
final var caption = figure.nextElementSibling();
|
||||
if (caption != null && caption.tagName().equals("figcaption")) {
|
||||
final var typeStr = caption.text();
|
||||
return getPictureType(typeStr);
|
||||
} else {
|
||||
return PictureType.UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
private static String getImagesURL(final Bird bird) {
|
||||
final var name = Utils.getName(bird);
|
||||
return "https://www.oiseaux.net/oiseaux/photos/" + name + ".html";
|
||||
}
|
||||
|
||||
private static PictureType getPictureType(final String name) {
|
||||
return switch (name.toLowerCase(Locale.ROOT)) {
|
||||
case "adulte" -> PictureType.ADULT;
|
||||
case "adulte plum. nuptial" -> PictureType.ADULT_NUPTIAL;
|
||||
case "adulte plum. internuptial" -> PictureType.ADULT_INTERNUPTIAL;
|
||||
case "juvénile", "poussin", "subadulte", "1ère année", "immature" -> PictureType.JUVENILE;
|
||||
case "♂ adulte plum. nuptial" -> PictureType.ADULT_NUPTIAL_MALE;
|
||||
case "♂ adulte plum. internuptial" -> PictureType.ADULT_INTERNUPTIAL_MALE;
|
||||
case "♂ adulte" -> PictureType.ADULT_MALE;
|
||||
case "♀ adulte" -> PictureType.ADULT_FEMALE;
|
||||
default -> PictureType.UNKNOWN;
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
package ch.gtache.fro.oiseaux.net;
|
||||
|
||||
import ch.gtache.fro.Bird;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
final class Utils {
|
||||
|
||||
private Utils() {
|
||||
|
||||
}
|
||||
|
||||
static String getName(final Bird bird) {
|
||||
return correctName(bird.name().replace("_", ".").toLowerCase(Locale.ROOT));
|
||||
}
|
||||
|
||||
static String correctName(final String name) {
|
||||
return switch (name) {
|
||||
case "bec.croise.des.sapins" -> "bec-croise.des.sapins";
|
||||
case "busard.saint.martin" -> "busard.saint-martin";
|
||||
case "chevalier.culblanc" -> "chevalier.cul-blanc";
|
||||
case "circaete.jean.le.blanc" -> "circaete.jean-le-blanc";
|
||||
case "gallinule.poule.d.eau" -> "gallinule.poule-d.eau";
|
||||
case "gobemouche.a.demi.collier" -> "gobemouche.a.demi-collier";
|
||||
case "grand.duc.d.europe" -> "grand-duc.d.europe";
|
||||
case "grosbec.casse.noyaux" -> "grosbec.casse-noyaux";
|
||||
case "heron.garde.boeufs" -> "heron.garde-boeufs";
|
||||
case "hibou.moyen.duc" -> "hibou.moyen-duc";
|
||||
case "martin.pecheur.d.europe" -> "martin-pecheur.d.europe";
|
||||
case "mesange.a.longue.queue" -> "orite.a.longue.queue";
|
||||
//case "mesange.alpestre" -> "mesange.boreale";
|
||||
//case "mesange.des.saules" -> "mesange.boreale";
|
||||
case "petit.duc.scops" -> "petit-duc.scops";
|
||||
case "pie.grieche.a.poitrine.rose" -> "pie-grieche.a.poitrine.rose";
|
||||
case "pie.grieche.a.tete.rousse" -> "pie-grieche.a.tete.rousse";
|
||||
case "pie.grieche.ecorcheur" -> "pie-grieche.ecorcheur";
|
||||
case "pie.grieche.grise" -> "pie-grieche.grise";
|
||||
case "pie.grieche.isabelle" -> "pie-grieche.isabelle";
|
||||
case "pigeon.biset.domestique" -> "pigeon.biset";
|
||||
case "puffin.cendre" -> "puffin.boreal";
|
||||
case "roitelet.a.triple.bandeau" -> "roitelet.triple-bandeau";
|
||||
case "vautour.percnoptere" -> "percnoptere.d.egypte";
|
||||
default -> name;
|
||||
};
|
||||
}
|
||||
}
|
||||
13
oiseaux-net/src/main/java/module-info.java
Normal file
13
oiseaux-net/src/main/java/module-info.java
Normal file
@@ -0,0 +1,13 @@
|
||||
/**
|
||||
* Module for oiseaux.net fetcher
|
||||
*/
|
||||
module ch.gtache.fro.oiseaux.net {
|
||||
requires transitive ch.gtache.fro.selenium;
|
||||
requires org.seleniumhq.selenium.chrome_driver;
|
||||
requires org.jsoup;
|
||||
requires jakarta.inject;
|
||||
requires java.compiler;
|
||||
requires org.apache.logging.log4j;
|
||||
|
||||
exports ch.gtache.fro.oiseaux.net;
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package ch.gtache.fro.oiseaux.net;
|
||||
|
||||
import ch.gtache.fro.FetchException;
|
||||
import ch.gtache.fro.impl.CommonBirdsProvider;
|
||||
import ch.gtache.fro.impl.ConfigurationImpl;
|
||||
|
||||
import static org.mockito.Mockito.*;
|
||||
|
||||
class TestOiseauxNetFetcher {
|
||||
|
||||
public static void main(final String[] args) throws FetchException {
|
||||
final var provider = spy(CommonBirdsProvider.class);
|
||||
final var configuration = mock(ConfigurationImpl.class, withSettings().useConstructor(provider).defaultAnswer(CALLS_REAL_METHODS));
|
||||
try (final var fetcher = new OiseauxNetFetcher(provider, configuration)) {
|
||||
System.out.println(fetcher.fetchAll());
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user