JsonLdJobMetadataExtractor.java

package com.wilzwert.myjobs.core.domain.model.job.ports.driven.extractor.impl;


import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.fasterxml.jackson.jr.ob.JSON;
import com.wilzwert.myjobs.core.domain.model.job.JobMetadata;
import com.wilzwert.myjobs.core.domain.model.job.jsonld.JobPosting;
import com.wilzwert.myjobs.core.domain.model.job.ports.driven.extractor.JobMetadataExtractor;


/**
 * @author Wilhelm Zwertvaegher
 */
public class JsonLdJobMetadataExtractor implements JobMetadataExtractor {

    private static final List<String> NOT_COMPATIBLE_DOMAINS = List.of("fhf.fr");

    private static final Pattern JSON_LD_PATTERN = Pattern.compile(
            "<script\\s+type=[\"']application/ld\\+json[\"'][^>]*>((?:(?!</script>).)*?\"@type\"\\s*:\\s*\"JobPosting\"(?:(?!</script>).)*?)</script>",
            Pattern.DOTALL | Pattern.MULTILINE
    );

    private JobMetadata buildExtractedMetadataFromJobPosting(JobPosting jobPosting) {
        JobMetadata.Builder builder = new JobMetadata.Builder();
        builder.title(jobPosting.title());
        builder.description(jobPosting.description());
        var organization = jobPosting.hiringOrganization();
        if(organization != null) {
            builder.company(organization.name());
        }

        if(jobPosting.qualifications() != null) {
            builder.profile(jobPosting.qualifications());
        }
        else if(jobPosting.experienceRequirements() != null) {
            builder.profile(jobPosting.experienceRequirements());
        }
        builder.url(jobPosting.url());
        builder.salary(jobPosting.computeSalary());
        return builder.build();
    }


    @Override
    public Optional<JobMetadata> extractJobMetadata(String html) {
        if(html == null || html.isEmpty()) {
            return Optional.empty();
        }

        Matcher matcher = JSON_LD_PATTERN.matcher(html);
        try {
            while(matcher.find()) {
                if(matcher.group(1) != null) {
                    JobPosting jobPosting = JSON.std.beanFrom(JobPosting.class, matcher.group(1));
                    return Optional.of(buildExtractedMetadataFromJobPosting(jobPosting));
                }
            }
            return Optional.empty();
        } catch (Exception e) {
            return Optional.empty();
        }
    }

    @Override
    public boolean isCompatible(String domain) {
        return NOT_COMPATIBLE_DOMAINS.stream().noneMatch(domain::matches);
    }
}