Commit 532ef347 authored by iwakeh's avatar iwakeh Committed by Karsten Loesing
Browse files

Only unescape valid UTF.

Add a utility method for only un-escaping valid utf and supply a test
as well as test data for this issue.

Fixes task-22594.
parent e7ac4ca9
# Changes in version 5.1-1.12.0 - 2018-??-??
* Minor changes
- Don't attempt to un-escape character sequences in contact lines
(like "\uk") that only happen to start like escaped utf-8 characters
(like "\u0055").
# Changes in version 5.1-1.11.0 - 2018-03-14
* Medium changes
......
......@@ -9,7 +9,6 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonParseException;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -318,8 +317,7 @@ public class DocumentStore {
* objects are escaped JSON, e.g., \u00F2. When Gson serializes
* this string, it escapes the \ to \\, hence writes \\u00F2. We
* need to undo this and change \\u00F2 back to \u00F2. */
documentString = StringUtils.replace(gson.toJson(document),
"\\\\u", "\\u");
documentString = FormattingUtils.replaceValidUtf(gson.toJson(document));
/* Existing details statuses don't contain opening and closing curly
* brackets, so we should remove them from new details statuses,
* too. */
......
......@@ -12,12 +12,11 @@ import org.torproject.onionoo.docs.DocumentStoreFactory;
import org.torproject.onionoo.docs.SummaryDocument;
import org.torproject.onionoo.docs.UptimeDocument;
import org.torproject.onionoo.docs.WeightsDocument;
import org.torproject.onionoo.util.FormattingUtils;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -348,7 +347,7 @@ public class ResponseBuilder {
/* Whenever we provide Gson with a string containing an escaped
* non-ASCII character like \u00F2, it escapes the \ to \\, which
* we need to undo before including the string in a response. */
return StringUtils.replace(gson.toJson(dd), "\\\\u", "\\u");
return FormattingUtils.replaceValidUtf(gson.toJson(dd));
} else {
// TODO We should probably log that we didn't find a details
// document that we expected to exist.
......
......@@ -3,8 +3,18 @@
package org.torproject.onionoo.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** Static helper methods for string processing etc. */
public class FormattingUtils {
/** Logger shared by the static helpers of this class; assigned once. */
private static final Logger log = LoggerFactory.getLogger(
    FormattingUtils.class);

/** Private and empty, so this class cannot be instantiated from
 * outside. */
private FormattingUtils() {
}
......@@ -35,5 +45,29 @@ public class FormattingUtils {
public static String formatDecimalNumber(long decimalNumber) {
  // The ',' flag asks the formatter to insert grouping separators.  No
  // Locale is passed in, so String.format falls back to the default one.
  final String groupedDecimalFormat = "%,d";
  return String.format(groupedDecimalFormat, decimalNumber);
}
/** Matches four literal backslashes followed by the letter u and exactly
 * four hexadecimal digits; the whole match is capture group 1. */
private static final Pattern escapePattern = Pattern.compile(
    "(\\\\{4}u[0-9a-fA-F]{4})");

/**
 * De-escape only valid UTF and leave anything else escaped.
 *
 * <p>Every occurrence of four backslashes followed by the letter u and
 * four hexadecimal digits is rewritten with two backslashes instead of
 * four.  All other text, including four backslashes followed by a u that
 * is not followed by four hexadecimal digits, is copied unchanged.</p>
 *
 * @param text the string to process; may be {@code null} or empty
 * @return the processed string, or {@code text} itself if it is
 *     {@code null} or empty or if processing fails with a runtime
 *     exception
 */
public static String replaceValidUtf(String text) {
  if (null == text || text.isEmpty()) {
    return text;
  }
  try {
    StringBuffer sb = new StringBuffer();
    Matcher mat = escapePattern.matcher(text);
    while (mat.find()) {
      /* Drop two of the four leading backslashes explicitly, then quote
       * the remainder so that appendReplacement inserts it literally
       * rather than treating its backslashes as escape characters. */
      String unescaped = mat.group(1).substring(2);
      mat.appendReplacement(sb, Matcher.quoteReplacement(unescaped));
    }
    mat.appendTail(sb);
    return sb.toString();
  } catch (RuntimeException ex) {
    /* Best effort: hand back the input untouched.  Only runtime
     * exceptions are handled here, so that JVM errors are not silently
     * swallowed at debug level. */
    log.debug("Couldn't process input '{}'.", text, ex);
    return text;
  }
}
}
package org.torproject.onionoo.util;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameter;
import org.junit.runners.Parameterized.Parameters;
import java.io.File;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/** Parameterized test that feeds pairs of lines from a test resource
 * through {@code FormattingUtils.replaceValidUtf}. */
@RunWith(Parameterized.class)
public class FormattingUtilsTest {

  /** Name of the classpath resource holding the test data. */
  private static final String TEST_DATA = "lines-for-escape-tests.txt";

  /**
   * Provide test data.
   *
   * <p>The resource is read pairwise: the first line of each pair is the
   * input, the second line is the expected output.</p>
   *
   * @return one two-element array (input, expected output) per pair
   * @throws Exception if the resource cannot be located or read, or if it
   *     contains an odd number of lines
   */
  @Parameters
  public static Collection<String[]> data() throws Exception {
    List<String> lines = Files.readAllLines((new File(ClassLoader
        .getSystemResource(TEST_DATA).toURI())).toPath());
    if (lines.size() % 2 != 0) {
      /* Fail with a clear message instead of an index error on the
       * dangling last line. */
      throw new IllegalStateException("Expected an even number of lines in "
          + TEST_DATA + " but found " + lines.size() + ".");
    }
    List<String[]> testData = new ArrayList<>();
    for (int i = 0; i < lines.size(); i += 2) {
      testData.add(new String[]{lines.get(i), lines.get(i + 1)});
    }
    return testData;
  }

  /** Input line handed to the method under test. */
  @Parameter(0)
  public String in;

  /** Expected result for {@link #in}. */
  @Parameter(1)
  public String out;

  /** The method under test must turn the input into the expected line. */
  @Test
  public void testReplaceUtf() {
    assertEquals(out, FormattingUtils.replaceValidUtf(in));
  }
}
abc
abc
\\\\u
\\\\u
Haha/\\\\@/\\\\live/\\\\./\\\\co/\\\\./\\\uk
Haha/\\\\@/\\\\live/\\\\./\\\\co/\\\\./\\\uk
\\\\u20ac
\\u20ac
\\\\u0024
\\u0024
some \\\\u20ac other string \\\\u0024 to unescape
some \\u20ac other string \\u0024 to unescape
abcd efg\\\\u0024xyz\\\\uxxxx
abcd efg\\u0024xyz\\\\uxxxx
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment