/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse;

import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metrics.ErrorTracker;
import org.apache.nutch.metrics.LatencyTracker;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.segment.SegmentChecker;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ParseSegment
extends NutchTool
implements Tool {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    public static final String SKIP_TRUNCATED = "parser.skip.truncated";

    /**
     * Creates a tool instance without a configuration by delegating to
     * {@link #ParseSegment(Configuration)} with {@code null}.
     */
    public ParseSegment() {
        this((Configuration) null);
    }

    /**
     * Creates a tool instance and hands the given configuration to the
     * {@link NutchTool} superclass constructor.
     *
     * @param conf the configuration to use; the no-argument constructor
     *             passes {@code null} here
     */
    public ParseSegment(Configuration conf) {
        super(conf);
    }

    /**
     * Decides whether the given content should be treated as truncated.
     *
     * <p>The decision is taken in this order:</p>
     * <ol>
     *   <li>no content bytes or no metadata: not truncated;</li>
     *   <li>if the metadata entry {@code http.content.truncated} is present,
     *       its value alone decides ({@code "true"} means truncated);</li>
     *   <li>otherwise the {@code Content-Length} metadata value is compared
     *       with the actual number of content bytes; content is truncated
     *       when the announced length is larger. A missing, empty or
     *       unparsable length yields {@code false}.</li>
     * </ol>
     *
     * <p>The announced length is parsed as a {@code long} so that values
     * above {@link Integer#MAX_VALUE} are recognised as truncation instead
     * of being rejected as malformed.</p>
     *
     * @param content the content to inspect
     * @return {@code true} if the content is considered truncated,
     *         {@code false} otherwise
     */
    public static boolean isTruncated(Content content) {
        byte[] contentBytes = content.getContent();
        if (contentBytes == null) {
            return false;
        }
        Metadata metadata = content.getMetadata();
        if (metadata == null) {
            return false;
        }

        // An explicit truncation flag takes precedence over any length comparison.
        String truncatedFlag = metadata.get("http.content.truncated");
        if (truncatedFlag != null) {
            if ("true".equals(truncatedFlag)) {
                LOG.info("{} skipped. Protocol metadata indicates truncated content, actualSize= {}", content.getUrl(), contentBytes.length);
                return true;
            }
            return false;
        }

        String lengthStr = metadata.get("Content-Length");
        if (lengthStr != null) {
            lengthStr = lengthStr.trim();
        }
        if (StringUtil.isEmpty(lengthStr)) {
            return false;
        }

        String url = content.getUrl();
        long inHeaderSize;
        try {
            // long, not int: an announced length beyond 2 GiB is valid and can
            // never fit into a byte array, so it must count as truncated.
            inHeaderSize = Long.parseLong(lengthStr);
        } catch (NumberFormatException e) {
            LOG.warn("Wrong contentlength format for {}", url, e);
            return false;
        }

        int actualSize = contentBytes.length;
        if (inHeaderSize > actualSize) {
            LOG.info("{} skipped. Content of size {} was truncated to {}", url, inHeaderSize, actualSize);
            return true;
        }
        LOG.debug("{} actualSize={} inHeaderSize={}", url, actualSize, inHeaderSize);
        return false;
    }

    /**
     * Runs the parse job for one segment.
     *
     * <p>If {@link SegmentChecker#isParsed} reports the segment as already
     * parsed, a warning is logged and nothing else happens. Otherwise a job
     * is configured that reads the segment's {@code content} directory,
     * uses {@link ParseSegmentMapper} and {@link ParseSegmentReducer}, and
     * writes into the segment directory via {@link ParseOutputFormat}.</p>
     *
     * @param segment the segment directory to parse
     * @throws IOException            rethrown from job setup or execution
     * @throws InterruptedException   rethrown from job execution
     * @throws ClassNotFoundException rethrown from job execution
     * @throws RuntimeException       if the job completes unsuccessfully
     */
    public void parse(Path segment) throws IOException, InterruptedException, ClassNotFoundException {
        boolean alreadyParsed = SegmentChecker.isParsed(segment, segment.getFileSystem(getConf()));
        if (alreadyParsed) {
            LOG.warn("Segment: {} already parsed!! Skipped parsing this segment!!", segment);
            return;
        }

        StopWatch timer = new StopWatch();
        timer.start();
        LOG.info("ParseSegment: starting");
        LOG.info("ParseSegment: segment: {}", segment);

        Job parseJob = Job.getInstance(getConf(), "Nutch ParseSegment: " + segment);
        Configuration jobConf = parseJob.getConfiguration();

        // Input side: the fetched content stored in the segment.
        FileInputFormat.addInputPath(parseJob, new Path(segment, "content"));
        jobConf.set("nutch.segment.name", segment.getName());
        parseJob.setInputFormatClass(SequenceFileInputFormat.class);

        // Processing classes.
        parseJob.setJarByClass(ParseSegment.class);
        parseJob.setMapperClass(ParseSegmentMapper.class);
        parseJob.setReducerClass(ParseSegmentReducer.class);

        // Output side: parse data is written back into the segment directory.
        FileOutputFormat.setOutputPath(parseJob, segment);
        parseJob.setOutputFormatClass(ParseOutputFormat.class);
        parseJob.setOutputKeyClass(Text.class);
        parseJob.setOutputValueClass(ParseImpl.class);

        try {
            if (!parseJob.waitForCompletion(true)) {
                String failure = NutchJob.getJobFailureLogMessage("Parse", parseJob);
                LOG.error(failure);
                throw new RuntimeException(failure);
            }
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            // Log the full stack trace here, then let the caller handle it.
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }

        timer.stop();
        LOG.info("ParseSegment: finished, elapsed: {} ms", timer.getTime(TimeUnit.MILLISECONDS));
    }

    /**
     * Command-line entry point: runs this tool through {@link ToolRunner}
     * with a configuration from {@link NutchConfiguration#create()} and
     * exits the JVM with the tool's return code.
     *
     * @param args command-line arguments, passed on to the tool
     * @throws Exception if the tool run fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        ParseSegment tool = new ParseSegment();
        int exitCode = ToolRunner.run(conf, tool, args);
        System.exit(exitCode);
    }

    /**
     * Parses the segment named by the first argument.
     *
     * <p>Recognised options after the segment (matched case-insensitively):</p>
     * <ul>
     *   <li>{@code -noFilter} sets {@code parse.filter.urls} to {@code false};</li>
     *   <li>{@code -noNormalize} sets {@code parse.normalize.urls} to {@code false}.</li>
     * </ul>
     * Any other argument is ignored.
     *
     * @param args the segment path followed by optional flags
     * @return {@code 0} after the parse call returns, or {@code -1} when no
     *         arguments were given (the usage text is printed to stderr)
     * @throws Exception if parsing the segment fails
     */
    @Override
    public int run(String[] args) throws Exception {
        String usage = "Usage: ParseSegment segment [-noFilter] [-noNormalize]";
        if (args.length == 0) {
            System.err.println(usage);
            // Return instead of System.exit so that callers embedding this tool
            // keep their JVM; main() turns the code into the process exit status.
            return -1;
        }

        // args[0] is the segment, everything after it is an optional flag.
        for (int i = 1; i < args.length; i++) {
            String param = args[i];
            if ("-nofilter".equalsIgnoreCase(param)) {
                getConf().setBoolean("parse.filter.urls", false);
            } else if ("-nonormalize".equalsIgnoreCase(param)) {
                getConf().setBoolean("parse.normalize.urls", false);
            }
        }

        parse(new Path(args[0]));
        return 0;
    }

    /**
     * Parses a segment selected through an argument map.
     *
     * <p>If {@code args} contains the key {@code segment}, its value must be
     * a {@link Path} or a {@link String}. Otherwise the most recently
     * modified entry of the local directory {@code <crawlId>/segments} is
     * used. The presence of the keys {@code nofilter} and
     * {@code nonormalize} sets {@code parse.filter.urls} and
     * {@code parse.normalize.urls} to {@code false}, respectively.</p>
     *
     * @param args    the argument map as described above
     * @param crawlId prefix of the segments directory, used only when no
     *                {@code segment} argument is given
     * @return a map whose key {@code result} holds the string {@code "0"}
     * @throws IllegalArgumentException if the {@code segment} argument has an
     *                                  unsupported type or no segment can be
     *                                  found in the segments directory
     * @throws Exception                if parsing the segment fails
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
        Path segment;
        if (args.containsKey("segment")) {
            Object seg = args.get("segment");
            if (seg instanceof Path) {
                segment = (Path) seg;
            } else if (seg instanceof String) {
                segment = new Path((String) seg);
            } else {
                // Fail here with a clear message rather than with a
                // NullPointerException further down in parse().
                throw new IllegalArgumentException("Argument 'segment' must be a Path or a String but was: "
                        + (seg == null ? "null" : seg.getClass().getName()));
            }
        } else {
            String segmentsPath = crawlId + "/segments";
            // listFiles() yields null if the path is not a readable directory.
            File[] candidates = new File(segmentsPath).listFiles();
            if (candidates == null || candidates.length == 0) {
                throw new IllegalArgumentException("No segments found in " + segmentsPath);
            }
            // Linear scan for the newest entry; on equal timestamps the
            // entry listed first wins.
            File newest = candidates[0];
            long newestModified = newest.lastModified();
            for (int i = 1; i < candidates.length; i++) {
                long modified = candidates[i].lastModified();
                if (modified > newestModified) {
                    newest = candidates[i];
                    newestModified = modified;
                }
            }
            segment = new Path(newest.getPath());
        }

        if (args.containsKey("nofilter")) {
            getConf().setBoolean("parse.filter.urls", false);
        }
        if (args.containsKey("nonormalize")) {
            getConf().setBoolean("parse.normalize.urls", false);
        }

        parse(segment);

        Map<String, Object> results = new HashMap<>();
        results.put("result", Integer.toString(0));
        return results;
    }

    public static class ParseSegmentReducer
    extends Reducer<Text, Writable, Text, Writable> {
        public void reduce(Text key, Iterable<Writable> values, Reducer.Context context) throws IOException, InterruptedException {
            Iterator<Writable> valuesIter = values.iterator();
            context.write((Object)key, (Object)valuesIter.next());
        }
    }

    public static class ParseSegmentMapper
    extends Mapper<WritableComparable<?>, Content, Text, ParseImpl> {
        private ParseUtil parseUtil;
        private Text newKey = new Text();
        private ScoringFilters scfilters;
        private boolean skipTruncated;
        private LatencyTracker parseLatencyTracker;
        private ErrorTracker errorTracker;

        public void setup(Mapper.Context context) {
            Configuration conf = context.getConfiguration();
            this.scfilters = new ScoringFilters(conf);
            this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
            this.parseLatencyTracker = new LatencyTracker("nutch_parser", "parse_latency");
            this.errorTracker = new ErrorTracker("nutch_parser", (TaskInputOutputContext<?, ?, ?, ?>)context);
        }

        public void cleanup(Mapper.Context context) throws IOException, InterruptedException {
            this.parseLatencyTracker.emitCounters((TaskInputOutputContext<?, ?, ?, ?>)context);
        }

        public void map(WritableComparable<?> key, Content content, Mapper.Context context) throws IOException, InterruptedException {
            String fetchStatus;
            if (key instanceof Text) {
                this.newKey.set(key.toString());
                key = this.newKey;
            }
            if ((fetchStatus = content.getMetadata().get("_fst_")) == null) {
                LOG.debug("Skipping {} as content has no fetch status", (Object)key);
                return;
            }
            if (Integer.parseInt(fetchStatus) != 33) {
                LOG.debug("Skipping {} as content is not fetched successfully", (Object)key);
                return;
            }
            if (this.skipTruncated && ParseSegment.isTruncated(content)) {
                return;
            }
            long start = System.currentTimeMillis();
            ParseResult parseResult = null;
            try {
                if (this.parseUtil == null) {
                    this.parseUtil = new ParseUtil(context.getConfiguration());
                }
                parseResult = this.parseUtil.parse(content);
            }
            catch (Exception e) {
                LOG.warn("Error parsing: {}: {}", (Object)key, (Object)StringUtils.stringifyException((Throwable)e));
                this.errorTracker.incrementCounters(e);
                return;
            }
            for (Map.Entry<Text, Parse> entry : parseResult) {
                Text url = entry.getKey();
                Parse parse = entry.getValue();
                ParseStatus parseStatus = parse.getData().getStatus();
                context.getCounter("nutch_parser", ParseStatus.majorCodes[parseStatus.getMajorCode()]).increment(1L);
                if (!parseStatus.isSuccess()) {
                    LOG.warn("Error parsing: {}: {}", (Object)key, (Object)parseStatus);
                    parse = parseStatus.getEmptyParse(context.getConfiguration());
                }
                parse.getData().getContentMeta().set("nutch.segment.name", context.getConfiguration().get("nutch.segment.name"));
                byte[] signature = SignatureFactory.getSignature(context.getConfiguration()).calculate(content, parse);
                parse.getData().getContentMeta().set("nutch.content.digest", StringUtil.toHexString(signature));
                try {
                    this.scfilters.passScoreAfterParsing(url, content, parse);
                }
                catch (ScoringFilterException e) {
                    LOG.warn("Error passing score: {}: {}", (Object)url, (Object)e.getMessage());
                    this.errorTracker.incrementCounters(ErrorTracker.ErrorType.SCORING);
                }
                long end = System.currentTimeMillis();
                long parseTime = end - start;
                this.parseLatencyTracker.record(parseTime);
                LOG.info("Parsed ({}ms): {}", (Object)parseTime, (Object)url);
                context.write((Object)url, (Object)new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse.isCanonical()));
            }
        }
    }
}

