Skip to content

Commit

Permalink
Skip UTF-8 BOM mark in EncodingDetectingInputStream and default to …
Browse files Browse the repository at this point in the history
…UTF-8 in `RewriteTest` (#4546)

* Skip UTF-8 BOM mark in `EncodingDetectingInputStream`

As the `EncodingDetectingInputStream` is only used as input for the parsers, we typically don't want to see any UTF-8 BOM marker. Additionally, platforms like .NET remove the BOM mark as well, so this change brings better compatibility.

* Add test for BOM skipping

* Improve performance of `EncodingDetectingInputStream` by a lot

* Fix `JavadocPrinterTest`

* Use UTF-8 as default encoding in `RewriteTest`

* Fix bug when reading from single byte stream

* Add missing test

* Adjust `CompilationUnitTest` whitespace

* Fix handling of empty files

* Polish one more test case
  • Loading branch information
knutwannheden authored Oct 3, 2024
1 parent fa7533b commit c63ab56
Show file tree
Hide file tree
Showing 13 changed files with 252 additions and 175 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

@State(Scope.Benchmark)
Expand All @@ -38,10 +39,13 @@ public void setup() throws URISyntaxException, IOException {
throw new RuntimeException("Unable to create directory");
}

sourceFiles = new ArrayList<>(1_000);
for (int i = 0; i < 1_000; i++) {
Files.writeString(test.resolve("Test" + i + ".java"),
"package test; class Test" + i + " {}");
sourceFiles = new ArrayList<>(100);
// to add some "meat to the bones"
String whitespace = String.join("", Collections.nCopies(1_000, " "));
for (int i = 0; i < 100; i++) {
Path path = test.resolve("Test" + i + ".java");
Files.writeString(path, "package test; class Test%d {%s}".formatted(i, whitespace));
sourceFiles.add(path);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,64 +16,53 @@
package org.openrewrite.benchmarks.java;

import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import org.openrewrite.InMemoryExecutionContext;
import org.openrewrite.Parser;
import org.openrewrite.java.JavaParser;
import org.openrewrite.tree.ParsingExecutionContextView;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;

import static java.util.stream.Collectors.toList;

@Fork(1)
@Measurement(iterations = 2)
@Warmup(iterations = 2)
@BenchmarkMode(Mode.SampleTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@Threads(4)
public class ParserInputBenchmark {

@Benchmark
public void readFromDisk(JavaFiles state) {
//language=java
public void detectCharset(JavaFiles state, Blackhole bh) {
JavaParser.fromJavaVersion().build()
.parseInputs(state.getSourceFiles().stream()
.map(sourceFile -> new Parser.Input(sourceFile, () -> {
try {
return Files.newInputStream(sourceFile);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
})
)
.map(Parser.Input::fromFile)
.collect(toList()),
null,
new InMemoryExecutionContext());
new InMemoryExecutionContext()
)
.forEach(bh::consume);
}

@Benchmark
public void readFromDiskWithBufferedInputStream(JavaFiles state) {
//language=java
public void knownCharset(JavaFiles state, Blackhole bh) {
ParsingExecutionContextView ctx = ParsingExecutionContextView.view(new InMemoryExecutionContext())
.setCharset(StandardCharsets.UTF_8);
JavaParser.fromJavaVersion().build()
.parseInputs(state.getSourceFiles().stream()
.map(sourceFile -> new Parser.Input(sourceFile, () -> {
try {
return new BufferedInputStream(Files.newInputStream(sourceFile));
} catch (IOException e) {
throw new UncheckedIOException(e);
}
})
)
.map(Parser.Input::fromFile)
.collect(toList()),
null,
new InMemoryExecutionContext());
ctx
)
.forEach(bh::consume);
}

public static void main(String[] args) throws RunnerException {
Expand Down
10 changes: 10 additions & 0 deletions rewrite-core/src/main/java/org/openrewrite/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,16 @@ public static Input fromString(Path sourcePath, String source, Charset charset)
return new Input(sourcePath, null, () -> new ByteArrayInputStream(source.getBytes(charset)), true);
}

public static Input fromFile(Path sourcePath) {
return new Input(sourcePath, FileAttributes.fromPath(sourcePath), () -> {
try {
return Files.newInputStream(sourcePath);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}, false);
}

@SuppressWarnings("unused")
public static Input fromResource(String resource) {
return new Input(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,33 +20,34 @@
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class EncodingDetectingInputStream extends InputStream {
private static final Charset WINDOWS_1252 = Charset.forName("Windows-1252");
private static final byte[] UTF8_BOM = new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};

private final InputStream inputStream;

@Nullable
private Charset charset;

private boolean bomChecked;
private boolean charsetBomMarked;

/**
* Last byte read
*/
private int prev;
private int prev2;
private int prev3;

boolean maybeTwoByteSequence = false;
boolean maybeThreeByteSequence = false;
boolean maybeFourByteSequence = false;

public EncodingDetectingInputStream(InputStream inputStream) {
this.inputStream = inputStream;
this.charset = null;
this(inputStream, null);
}

public EncodingDetectingInputStream(InputStream inputStream, @Nullable Charset charset) {
Expand All @@ -64,71 +65,92 @@ public boolean isCharsetBomMarked() {

@Override
public int read() throws IOException {
int aByte = inputStream.read();
int read;
if (!bomChecked) {
if (charset == null || charset == StandardCharsets.UTF_8) {
read = checkAndSkipUtf8Bom();
if (charsetBomMarked) {
read = inputStream.read();
}
} else {
bomChecked = true;
read = inputStream.read();
}
} else {
read = inputStream.read();
}


// if we haven't yet determined a charset...
if (read == -1) {
if (charset == null) {
if (maybeTwoByteSequence || maybeThreeByteSequence || maybeFourByteSequence) {
charset = WINDOWS_1252;
} else {
charset = StandardCharsets.UTF_8;
}
}
} else if (charset == null) {
guessCharset(read);
}
return read;
}

@Override
public int read(byte[] b, int off, int len) throws IOException {
if (charset == null) {
guessCharset(aByte);
// we need to read the bytes one-by-one to determine the encoding
return super.read(b, off, len);
} else if (charset == StandardCharsets.UTF_8 && !bomChecked) {
int read = checkAndSkipUtf8Bom();
if (read == -1) {
return -1;
} else if (!charsetBomMarked) {
b[off++] = (byte) read;
}
read = inputStream.read(b, off, len - 1);
return read == -1 ? charsetBomMarked ? -1 : 1 : (charsetBomMarked ? 0 : 1) + read;
} else {
return inputStream.read(b, off, len);
}
return aByte;
}

private void guessCharset(int aByte) {
if (prev3 == 0xEF && prev2 == 0xBB && prev == 0xBF) {
charsetBomMarked = true;
charset = StandardCharsets.UTF_8;
} else {
if (aByte == -1 || !(prev2 == 0 && prev == 0xEF || prev3 == 0 && prev2 == 0xEF)) {
if (maybeTwoByteSequence) {
if (aByte == -1 && !utf8SequenceEnd(prev) || aByte != -1 && !(utf8SequenceEnd(aByte))) {
charset = WINDOWS_1252;
} else {
maybeTwoByteSequence = false;
prev2 = -1;
prev = -1;
}
} else if (maybeThreeByteSequence) {
if (aByte == -1 ||
utf8SequenceEnd(prev) && !(utf8SequenceEnd(aByte)) ||
!utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}

if (utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
maybeThreeByteSequence = false;
prev2 = -1;
prev = -1;
}
} else if (maybeFourByteSequence) {
if (aByte == -1 ||
utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) ||
utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) ||
!(utf8SequenceEnd(aByte))) {
charset = WINDOWS_1252;
}

if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
maybeFourByteSequence = false;
prev2 = -1;
prev = -1;
}
} else if (utf8TwoByteSequence(aByte)) {
maybeTwoByteSequence = true;
} else if (utf8ThreeByteSequence(aByte)) {
maybeThreeByteSequence = true;
} else if (utf8FourByteSequence(aByte)) {
maybeFourByteSequence = true;
} else if (!utf8TwoByteSequence(prev) && utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}
if (utf8TwoByteSequence(aByte)) {
maybeTwoByteSequence = true;
} else if (utf8ThreeByteSequence(aByte)) {
maybeThreeByteSequence = true;
} else if (utf8FourByteSequence(aByte)) {
maybeFourByteSequence = true;
} else if (maybeTwoByteSequence) {
if (!utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
} else {
maybeTwoByteSequence = false;
prev = -1;
}
} else if (maybeThreeByteSequence) {
if (!utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}

if (aByte == -1 && charset == null) {
charset = StandardCharsets.UTF_8;
if (utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
maybeThreeByteSequence = false;
prev = -1;
}
} else if (maybeFourByteSequence) {
if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) || utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) || !utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}

if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
maybeFourByteSequence = false;
prev = -1;
}
} else if (utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}

prev3 = prev2;
prev2 = prev;
prev = aByte;
}
Expand All @@ -143,14 +165,36 @@ public synchronized String toString() {
};
byte[] buffer = new byte[4096];
int n;
while ((n = is.read(buffer)) != -1) {
// Note that `is` is this, so the BOM will be checked in `read()`
while ((n = is.read(buffer, 0, buffer.length)) != -1) {
bos.write(buffer, 0, n);
}

return bos.toString();
} catch (IOException e) {
throw new UnsupportedOperationException(e);
throw new UncheckedIOException(e);
}
}

private int checkAndSkipUtf8Bom() throws IOException {
// `Files#newInputStream()` does not need to support mark/reset, so one at the time...
bomChecked = true;
int read = inputStream.read();
if ((byte) read != UTF8_BOM[0]) {
return read;
}
read = inputStream.read();
if ((byte) read != UTF8_BOM[1]) {
return read;
}
read = inputStream.read();
if ((byte) read != UTF8_BOM[2]) {
return read;
}
charsetBomMarked = true;
charset = StandardCharsets.UTF_8;
// return anything other that -1
return -2;
}

// The first byte of a UTF-8 two byte sequence is between 0xC0 - 0xDF.
Expand Down
5 changes: 2 additions & 3 deletions rewrite-core/src/test/java/org/openrewrite/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import org.openrewrite.tree.ParseError;
import org.openrewrite.tree.ParsingExecutionContextView;

import java.io.ByteArrayInputStream;
import java.nio.file.Path;
import java.nio.file.Paths;

Expand Down Expand Up @@ -80,8 +79,8 @@ void printIdempotentDiffPrint() {
line 2
""";

Parser.Input input1 = new Parser.Input(path, () -> new ByteArrayInputStream(before.getBytes()));
Parser.Input input2 = new Parser.Input(path, () -> new ByteArrayInputStream(after.getBytes()));
Parser.Input input1 = Parser.Input.fromString(path, before);
Parser.Input input2 = Parser.Input.fromString(path, after);
SourceFile s1 = parser.parse(input1.getSource(ctx).readFully()).toList().get(0);
SourceFile out = parser.requirePrintEqualsInput(s1, input2, null, ctx);
assertThat(out).isInstanceOf(ParseError.class);
Expand Down
Loading

0 comments on commit c63ab56

Please sign in to comment.