// StaticHtmlFetcherAdapter.java

package com.wilzwert.myjobs.infrastructure.adapter.fetcher;


import com.wilzwert.myjobs.core.domain.shared.ports.driven.fetcher.StaticHtmlFetcher;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import org.jsoup.Jsoup;
import org.jsoup.Connection;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.util.Map;
import java.util.Optional;

/**
 * @author Wilhelm Zwertvaegher
 */
@Component
@Slf4j
public class StaticHtmlFetcherAdapter implements StaticHtmlFetcher {
    // FIXME : this is not a very clean way of setting headers
    private static final Map<String, String> HEADERS = Map.of(
        "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
        "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language", "en-US,en;q=0.5",
        "Accept-Encoding", "gzip, deflate",
        "Connection", "keep-alive",
        "Upgrade-Insecure-Requests", "1",
        "Sec-Fetch-Dest", "document",
        "Sec-Fetch-Mode", "navigate",
        "Sec-Fetch-Site", "none",
        "Sec-Fetch-User", "?1"
    );
    private static final Map<String, String> OTHER_HEADERS = Map.of(
    "Cache-Control", "max-age=0"
    );

    @Override
    public Optional<String> fetchHtml(String url) {
        log.info("Fetching HTML from {}", url);

        try {
            Connection connection = Jsoup
                    .connect(url)
                    .ignoreContentType(true)
                    .referrer(url);

            HEADERS.forEach(connection::header);
            OTHER_HEADERS.forEach(connection::header);
            Document document = connection
                    .get();
            log.info("got html {}", document.html());
            return Optional.of(document.html());
        }
        catch (IOException e) {
            log.error(e.getMessage(), e);
            return Optional.empty();
        }
    }
}