Commit 3cd814d8 authored by iwakeh's avatar iwakeh Committed by Karsten Loesing
Browse files

Add new descriptor type for web server access logs.

Implements task-22983 and is based on the log-descriptor
specification.
parent 5903c105
# Changes in version 2.2.0 - 2018-01-??
* Major changes
- Add new descriptor type WebServerAccessLog to parse web server
access logs.
# Changes in version 2.1.1 - 2017-10-09
* Minor changes
......
/* Copyright 2017--2018 The Tor Project
* See LICENSE for licensing information */
package org.torproject.descriptor;
import java.util.List;
/**
 * Descriptor that holds the contents of a log file.
 *
 * <p>In contrast to other descriptor types, logs can grow very large and are
 * usually kept on disk in compressed form. Regardless of that, this interface
 * and its subinterfaces always expose log contents uncompressed.</p>
 *
 * @since 2.2.0
 */
public interface LogDescriptor extends Descriptor {

  /**
   * Returns the raw bytes of the log in decompressed form.
   *
   * @since 2.2.0
   */
  @Override
  byte[] getRawDescriptorBytes();

  /**
   * Returns the annotations found in the log file; the list may be empty if
   * the log format does not support adding annotations.
   *
   * @since 2.2.0
   */
  @Override
  List<String> getAnnotations();

  /**
   * Returns the lines that were not recognized while parsing the log.
   *
   * <p>Depending on the log type this may be an empty list or a fixed-size
   * list holding only a few entries.</p>
   *
   * @since 2.2.0
   */
  @Override
  List<String> getUnrecognizedLines();
}
/* Copyright 2017--2018 The Tor Project
* See LICENSE for licensing information */
package org.torproject.descriptor;
import java.time.LocalDate;
import java.util.List;
/**
 * Descriptor for a sanitized web server access log file written by a
 * {@code torproject.org} web server.
 *
 * <p>Non-sanitized access logs, whether from {@code torproject.org} web
 * servers or from other web servers, are not explicitly supported; parsing
 * them may work anyway.</p>
 *
 * @since 2.2.0
 */
public interface WebServerAccessLog extends LogDescriptor {

  /**
   * Returns the date on which the requests contained in the log were started,
   * as parsed from the log file path.
   *
   * <p>Typical web server access logs may carry a date in their file path,
   * too, but there it is the date on which the log file was rotated, which is
   * not necessarily the same as the date found in the contained request
   * lines.</p>
   *
   * @since 2.2.0
   */
  LocalDate getLogDate();

  /**
   * Returns the hostname of the physical host that wrote this log file, as
   * parsed from the log file path.
   *
   * <p>One physical host can serve several virtual hosts, and one virtual
   * host can be served by several physical hosts.</p>
   *
   * @since 2.2.0
   */
  String getPhysicalHost();

  /**
   * Returns the hostname of the virtual host for which this log file was
   * written, as parsed from the log file path.
   *
   * <p>One physical host can serve several virtual hosts, and one virtual
   * host can be served by several physical hosts.</p>
   *
   * @since 2.2.0
   */
  String getVirtualHost();

  /**
   * Returns no more than three unrecognized lines that were encountered while
   * parsing the log.
   *
   * @since 2.2.0
   */
  @Override
  List<String> getUnrecognizedLines();
}
......@@ -9,6 +9,10 @@ import static org.torproject.descriptor.impl.DescriptorImpl.SP;
import org.torproject.descriptor.Descriptor;
import org.torproject.descriptor.DescriptorParseException;
import org.torproject.descriptor.DescriptorParser;
import org.torproject.descriptor.log.LogDescriptorImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.lang.reflect.Constructor;
......@@ -19,6 +23,9 @@ import java.util.List;
public class DescriptorParserImpl implements DescriptorParser {
private static final Logger log
= LoggerFactory.getLogger(DescriptorParserImpl.class);
@Override
public Iterable<Descriptor> parseDescriptors(byte[] rawDescriptorBytes,
File descriptorFile, String fileName) {
......@@ -26,8 +33,7 @@ public class DescriptorParserImpl implements DescriptorParser {
return this.detectTypeAndParseDescriptors(rawDescriptorBytes,
descriptorFile, fileName);
} catch (DescriptorParseException e) {
/* Looks like we attempted to parse the whole raw descriptor bytes at once
* below and ran into a parse issue. */
log.debug("Cannot parse descriptor file ’{}’.", descriptorFile, e);
List<Descriptor> parsedDescriptors = new ArrayList<>();
parsedDescriptors.add(new UnparseableDescriptorImpl(rawDescriptorBytes,
new int[] { 0, rawDescriptorBytes.length }, descriptorFile, e));
......@@ -124,6 +130,8 @@ public class DescriptorParserImpl implements DescriptorParser {
} else if (firstLines.startsWith("@type torperf 1.")) {
return TorperfResultImpl.parseTorperfResults(rawDescriptorBytes,
descriptorFile);
} else if (descriptorFile.getName().contains(LogDescriptorImpl.MARKER)) {
return LogDescriptorImpl.parse(rawDescriptorBytes, descriptorFile);
} else {
throw new DescriptorParseException("Could not detect descriptor "
+ "type in descriptor starting with '" + firstLines + "'.");
......
......@@ -2,14 +2,12 @@
* See LICENSE for licensing information */
/**
* <h1>This package is still in alpha stage.</h1>
* <p>The public interface might still change in unexpected ways.</p>
* <h1>This package is part of the implementation not the public API.</h1>
* <p>The public interface might change in unexpected ways.</p>
*
* <p>Interfaces and essential classes for obtaining and processing
* CollecTor's index.json file.</p>
*
* <p>Interfaces and classes make the content of index.json available.</p>
*
*
* @since 1.4.0
*/
......
......@@ -12,6 +12,8 @@ import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
......@@ -43,6 +45,8 @@ public enum FileType {
/**
* Returns <code>valueOf</code> or the default enum {@link #PLAIN}, i.e.,
* this method doesn't throw any exceptions and always returns a valid enum.
*
* @since 2.1.0
*/
public static FileType findType(String ext) {
FileType res = null;
......@@ -54,16 +58,61 @@ public enum FileType {
}
}
/** Return the appropriate input stream. */
/**
 * Creates a new instance of this type's input stream class by passing the
 * given stream to its constructor that accepts a single
 * {@link InputStream}.
 *
 * @since 1.4.0
 */
public InputStream inputStream(InputStream is) throws Exception {
  /* Look up the (InputStream) constructor reflectively, then wrap. */
  Class<?>[] parameterTypes = { InputStream.class };
  return this.inClass.getConstructor(parameterTypes).newInstance(is);
}
/** Return the appropriate output stream. */
/**
 * Creates a new instance of this type's output stream class by passing the
 * given stream to its constructor that accepts a single
 * {@link OutputStream}.
 *
 * @since 1.4.0
 */
public OutputStream outputStream(OutputStream os) throws Exception {
  /* Look up the (OutputStream) constructor reflectively, then wrap. */
  Class<?>[] parameterTypes = { OutputStream.class };
  return this.outClass.getConstructor(parameterTypes).newInstance(os);
}
/**
 * Compresses the given bytes entirely in memory and returns the result.
 *
 * <p>The bytes are written through the stream obtained from
 * {@link #outputStream(OutputStream)}; that stream is closed before the
 * collected bytes are read back, so any trailer written on close is
 * included.</p>
 *
 * @since 2.2.0
 */
public byte[] compress(byte[] bytes) throws Exception {
  ByteArrayOutputStream sink = new ByteArrayOutputStream();
  try (OutputStream compressing = this.outputStream(sink)) {
    compressing.write(bytes);
    compressing.flush();
  }
  /* Only read the sink after the compressing stream has been closed. */
  byte[] compressed = sink.toByteArray();
  return compressed;
}
/**
 * Decompresses the given bytes in memory and returns the decompressed bytes.
 *
 * <p>An empty input array is returned unchanged without creating any
 * stream. Otherwise the bytes are read through the stream obtained from
 * {@link #inputStream(InputStream)} until that stream reports its end.</p>
 *
 * @param bytes the compressed bytes, must not be {@code null}
 * @return the decompressed bytes
 * @throws Exception if the decompressing stream cannot be created or
 *     reading from it fails
 *
 * @since 2.2.0
 */
public byte[] decompress(byte[] bytes) throws Exception {
  if (0 == bytes.length) {
    return bytes;
  }
  try (InputStream is
      = this.inputStream(new ByteArrayInputStream(bytes));
      ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
    /* Read in chunks. End of stream is signalled by -1 only; a data byte
     * with value zero is valid content and must not end the loop. */
    byte[] buffer = new byte[8192];
    int count = is.read(buffer);
    while (count != -1) {
      baos.write(buffer, 0, count);
      count = is.read(buffer);
    }
    return baos.toByteArray();
  }
}
}
......@@ -2,11 +2,13 @@
* See LICENSE for licensing information */
/**
* <h1>This package is part of the implementation not the public API.</h1>
* <p>The public interface might change in unexpected ways.</p>
* Interfaces and essential classes for obtaining and processing
* descriptors.
*
* <p><strong>This package is part of the implementation not the
* public API.</strong></p>
*
* <p>Interfaces and essential classes for obtaining and processing
* descriptors.</p>
* <p>The public interface might change in unexpected ways.</p>
*
* @since 2.1.0
*/
......
/* Copyright 2017--2018 The Tor Project
* See LICENSE for licensing information */
package org.torproject.descriptor.log;
import org.torproject.descriptor.DescriptorParseException;
import org.torproject.descriptor.LogDescriptor;
/**
 * Internal extension of {@link LogDescriptor}; its methods are meant for
 * internal use only.
 *
 * @since 2.2.0
 */
public interface InternalLogDescriptor extends LogDescriptor {

  /** Separator between the parts of a log file name. */
  String SEP = "_";

  /**
   * Validates the log lines.
   *
   * @throws DescriptorParseException if validation cannot be carried out
   *
   * @since 2.2.0
   */
  void validate() throws DescriptorParseException;

  /**
   * Sets the {@code Validator} that is used for validating log lines.
   *
   * <p>This is usually done by the implementing class.</p>
   *
   * @since 2.2.0
   */
  void setValidator(Validator validator);

  /**
   * Sets the bytes of the descriptor.
   *
   * @since 2.2.0
   */
  void setRawDescriptorBytes(byte[] bytes);

  /** Returns the compression preferred by the descriptor. */
  String getCompressionType();

  /**
   * Single-method contract for checking one log line.
   *
   * @since 2.2.0
   */
  interface Validator {

    /**
     * Verifies the given log line.
     *
     * @since 2.2.0
     */
    boolean validate(String line);
  }
}
/* Copyright 2018 The Tor Project
* See LICENSE for licensing information */
package org.torproject.descriptor.log;
/**
 * Internal extension for web server access logs; intended for internal use
 * only.
 *
 * @since 2.2.0
 */
public interface InternalWebServerAccessLog extends InternalLogDescriptor {

  /** String that the name of the log is expected to include. */
  String MARKER = "access.log";
}
/* Copyright 2017--2018 The Tor Project
* See LICENSE for licensing information */
package org.torproject.descriptor.log;
import org.torproject.descriptor.Descriptor;
import org.torproject.descriptor.DescriptorParseException;
import org.torproject.descriptor.LogDescriptor;
import org.torproject.descriptor.internal.FileType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Base class for log descriptors.
 *
 * <p>Keeps the uncompressed log bytes together with the descriptor file and
 * the compression type derived from the file name, and offers line-wise
 * validation through a pluggable {@code Validator}.</p>
 *
 * @since 2.2.0
 */
public abstract class LogDescriptorImpl
    implements LogDescriptor, InternalLogDescriptor {

  /** The log's file name should contain this string. */
  public static final String MARKER = ".log";

  /** Maximum number of unrecognized lines collected per validation run. */
  private static final int UNRECOGNIZED_LINES_LIMIT = 3;

  private static final Logger log
      = LoggerFactory.getLogger(LogDescriptorImpl.class);

  /* Group 1 captures the (possibly empty) file name extension following the
   * marker, separator and an alphanumeric part.
   * NOTE(review): the '.' of MARKER is not quoted here and therefore matches
   * any character; kept as is to not change which names are accepted. */
  private static final Pattern FILENAME_PATTERN = Pattern.compile(
      "(?:\\S*)" + MARKER + SEP + "(?:[0-9a-zA-Z]*)(?:\\.?)([a-zA-Z2]*)");

  private final File descriptorFile;

  /** Byte array for plain, i.e. uncompressed, log data. */
  private byte[] logBytes;

  private FileType fileType;

  private List<String> unrecognizedLines = new ArrayList<>();

  /** Accepts every line until a subclass or caller sets a real validator. */
  private Validator validator = (String line) -> true;

  /**
   * This constructor performs basic operations on the given bytes.
   *
   * <p>The compression type is derived from the extension in the file name.
   * If that extension does not name a known compression type, the bytes are
   * taken to be uncompressed and kept unchanged, and the given default
   * compression is recorded as this descriptor's compression type (see
   * {@link #getCompressionType}). Otherwise the bytes are decompressed
   * using the detected type.</p>
   *
   * @param logBytes the log content, compressed as indicated by the file
   *     name extension
   * @param descriptorFile the file whose name determines the compression
   * @param defaultCompression the compression type to record when the file
   *     name indicates no known compression
   * @throws DescriptorParseException if the file name does not match the
   *     expected pattern or decompression fails
   *
   * @since 2.2.0
   */
  protected LogDescriptorImpl(byte[] logBytes, File descriptorFile,
      FileType defaultCompression) throws DescriptorParseException {
    this.logBytes = logBytes;
    this.descriptorFile = descriptorFile;
    try {
      Matcher mat = FILENAME_PATTERN.matcher(descriptorFile.getName());
      if (!mat.find()) {
        throw new DescriptorParseException(
            "Log file name doesn't comply to standard: " + descriptorFile);
      }
      /* Locale.ROOT keeps the enum lookup independent of the default locale
       * of the JVM. */
      this.fileType
          = FileType.findType(mat.group(1).toUpperCase(Locale.ROOT));
      if (FileType.PLAIN == this.fileType) {
        this.fileType = defaultCompression;
      } else {
        this.logBytes = this.fileType.decompress(this.logBytes);
      }
    } catch (Exception ex) {
      throw new DescriptorParseException("Cannot parse file "
          + descriptorFile.getName(), ex);
    }
  }

  /**
   * Runs the validator over all non-empty lines of the log and appends up to
   * {@code UNRECOGNIZED_LINES_LIMIT} rejected lines to the list of
   * unrecognized lines.
   *
   * <p>Lines are decoded as UTF-8 so that the result does not depend on the
   * platform's default charset.</p>
   *
   * @throws DescriptorParseException if reading or validating lines fails
   */
  @Override
  public void validate() throws DescriptorParseException {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
        new ByteArrayInputStream(this.logBytes), StandardCharsets.UTF_8))) {
      /* BufferedReader.lines() never yields null elements, so only empty
       * lines need to be skipped before asking the validator. */
      this.unrecognizedLines.addAll(br.lines().parallel()
          .filter(line -> !line.isEmpty() && !validator.validate(line))
          .limit(UNRECOGNIZED_LINES_LIMIT).collect(Collectors.toList()));
    } catch (Exception ex) {
      throw new DescriptorParseException("Cannot validate log lines.", ex);
    }
  }

  /**
   * Assemble a LogDescriptor.
   *
   * <p>Currently only file names containing
   * {@link InternalWebServerAccessLog#MARKER} are supported.</p>
   *
   * @param logBytes the log content as read from the descriptor file
   * @param descriptorFile the file the bytes were read from
   * @return a list holding the single parsed descriptor
   * @throws DescriptorParseException if the file name does not indicate a
   *     supported log type or the descriptor cannot be created
   *
   * @since 2.2.0
   */
  public static List<Descriptor> parse(byte[] logBytes,
      File descriptorFile) throws DescriptorParseException {
    if (descriptorFile.getName().contains(InternalWebServerAccessLog.MARKER)) {
      return Arrays.<Descriptor>asList(
          new WebServerAccessLogImpl(logBytes, descriptorFile));
    } else {
      throw new DescriptorParseException("Cannot parse file "
          + descriptorFile.getName());
    }
  }

  /**
   * Joins the given lines with newline characters, appends a final newline
   * and returns the result encoded as UTF-8.
   *
   * <p>An empty collection yields a single newline.</p>
   *
   * @param lines the lines to join, without line terminators
   * @return the UTF-8 encoded bytes of all lines
   */
  public static byte[] collectionToBytes(Collection<String> lines) {
    return lines.stream().collect(Collectors.joining("\n", "", "\n"))
        .getBytes(StandardCharsets.UTF_8);
  }

  @Override
  public void setValidator(Validator validator) {
    this.validator = validator;
  }

  /** Returns the lower-case name of this descriptor's compression type. */
  @Override
  public String getCompressionType() {
    /* Locale.ROOT avoids locale-specific lower-casing, e.g., of 'I'. */
    return this.fileType.name().toLowerCase(Locale.ROOT);
  }

  @Override
  public byte[] getRawDescriptorBytes() {
    return this.logBytes;
  }

  @Override
  public void setRawDescriptorBytes(byte[] bytes) {
    this.logBytes = bytes;
  }

  @Override
  public int getRawDescriptorLength() {
    return this.logBytes.length;
  }

  /** Returns an empty list. */
  @Override
  public List<String> getAnnotations() {
    return Collections.emptyList();
  }

  @Override
  public List<String> getUnrecognizedLines() {
    return this.unrecognizedLines;
  }

  @Override
  public File getDescriptorFile() {
    return descriptorFile;
  }
}
/* Copyright 2017--2018 The Tor Project
* See LICENSE for licensing information */
package org.torproject.descriptor.log;
import org.torproject.descriptor.DescriptorParseException;
import org.torproject.descriptor.WebServerAccessLog;
import org.torproject.descriptor.internal.FileType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Implementation of web server access log descriptors.
*
* <p>Defines sanitization and validation for web server access logs.</p>
*
* @since 2.2.0
*/
public class WebServerAccessLogImpl extends LogDescriptorImpl
implements InternalWebServerAccessLog, WebServerAccessLog {
private static final Logger log
= LoggerFactory.getLogger(WebServerAccessLogImpl.class);
/** The log's name should include this string. */
public static final String MARKER = InternalWebServerAccessLog.MARKER;
/** The mandatory web server log descriptor file name pattern. */
public static final Pattern filenamePattern
= Pattern.compile("(\\S*)" + SEP + "(\\S*)" + SEP + "" + MARKER
+ SEP + "(\\d*)(?:\\.?)([a-zA-Z]*)");
private final String physicalHost;
private final String virtualHost;
private final LocalDate logDate;
/**
* Creates a WebServerAccessLog from the given bytes and filename.
*
* <p>The given bytes are read, whereas the file is not read.</p>
*
* <p>The path of the given file has to be compliant to the following
* naming pattern
* {@code
* <virtualHost>-<physicalHost>-access.log-<yyyymmdd>.<compression>},
* where an unknown compression type (see {@link #getCompressionType})
* is interpreted as missing compression. In this case the bytes
* will be compressed to the default compression type.
* The immediate parent name is taken to be the physical host collecting the
* logs.</p>
*/
protected WebServerAccessLogImpl(byte[] logBytes, File file)
throws DescriptorParseException {
this(logBytes, file, FileType.XZ);