Skip to content

Commit c8e5576

Browse files
authored
Merge pull request #431 from sosy-lab/412-inconsistent-handling-of-unicode-characters-in-string-theory
Fix handling of Unicode characters in String theory
2 parents 2bf59c8 + 2f33842 commit c8e5576

File tree

7 files changed

+155
-79
lines changed

7 files changed

+155
-79
lines changed

src/org/sosy_lab/java_smt/api/StringFormulaManager.java

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import java.util.Arrays;
1212
import java.util.List;
1313
import org.sosy_lab.java_smt.api.NumeralFormula.IntegerFormula;
14+
import org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager;
1415

1516
/**
1617
* Manager for dealing with string formulas. Functions come from <a
@@ -21,12 +22,15 @@ public interface StringFormulaManager {
2122
/**
2223
* Creates a {@link StringFormula} representing the given constant String.
2324
*
24-
* <p>This method accepts plain Java Strings with Unicode characters from the Basic Multilingual
25-
* Plane (BMP) (codepoints in range [0x00000, 0x2FFFF]). JavaSMT handles escaping internally, as
26-
* some solvers follow the SMTLIB standard and escape Unicode characters with curly braces.
27-
*
28-
* <p>Additionally, you can use SMTLIB escaping like "\\u{1234}" to represent Unicode characters
29-
* directly.
25+
* <p>The String argument is expected to be a regular Java String and may contain Unicode
26+
* characters from the first 3 planes (codepoints 0x00000-0x2FFFFF). Higher codepoints are not
27+
* allowed due to limitations in SMTLIB. We do not support SMTLIB escape sequences in this method,
28+
* and String like <code>"\\u{abc}"</code> are read verbatim and are not substituted with their
29+
* Unicode character. If you still want to use SMTLIB Strings with this method, the function
30+
* {@link AbstractStringFormulaManager#unescapeUnicodeForSmtlib(String)} can be used to handle the
31+
* conversion before calling this method. Note that you may then also have to use {@link
32+
* AbstractStringFormulaManager#escapeUnicodeForSmtlib(String)} to convert Strings from the model
33+
* back to a format that is compatible with other SMTLIB based solvers.
3034
*
3135
* @param value the string value the returned {@link StringFormula} should represent
3236
* @return a {@link StringFormula} representing the given value
@@ -238,8 +242,8 @@ default RegexFormula concat(RegexFormula... parts) {
238242

239243
/**
240244
* Returns a String formula representing the single character with the given code point, if it is
241-
* a valid Unicode code point within the Basic Multilingual Plane (BMP) (codepoints in range
242-
* [0x00000, 0x2FFFF]). Otherwise, returns the empty string.
245+
* a valid Unicode code point within the Basic Multilingual Plane (BMP) or planes 1 and 2
246+
* (codepoints in range [0x00000, 0x2FFFF]). Otherwise, returns the empty string.
243247
*/
244248
StringFormula fromCodePoint(IntegerFormula codePoint);
245249
}

src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ public abstract class AbstractStringFormulaManager<TFormulaInfo, TType, TEnv, TF
3838
+ "((?<codePoint>[0-9a-fA-F]{4})"
3939
+ "|"
4040
// or curly brackets like "\\u{61}"
41-
+ "(\\{(?<codePointInBrackets>[0-9a-fA-F]{1,5})}))");
41+
+ "(\\{(?<codePointInBrackets>[0-9a-fA-F]{1,5})\\}))");
4242

4343
protected AbstractStringFormulaManager(
4444
FormulaCreator<TFormulaInfo, TType, TEnv, TFuncDecl> pCreator) {
@@ -60,28 +60,29 @@ protected IntegerFormula wrapInteger(TFormulaInfo formulaInfo) {
6060
@Override
6161
public StringFormula makeString(String value) {
6262
checkArgument(
63-
areAllCodePointsInRange(value),
64-
"String constant is out of supported Unicode range (Plane 0-2).");
63+
areAllCodePointsInRange(value, 0, 0x2FFFF),
64+
"String constants may only contain Unicode characters from the first three planes "
65+
+ "(codepoints 0x00000 to 0x2FFFF).");
6566
return wrapString(makeStringImpl(value));
6667
}
6768

68-
/** returns whether all Unicode characters in Planes 0-2. */
69-
private static boolean areAllCodePointsInRange(String str) {
70-
return str.codePoints().allMatch(AbstractStringFormulaManager::isCodePointInRange);
71-
}
72-
73-
private static boolean isCodePointInRange(int codePoint) {
74-
return 0x00000 <= codePoint && codePoint <= 0x2FFFF;
69+
/** Check if the codepoints of all characters in the String are in range. */
70+
private static boolean areAllCodePointsInRange(String str, int lower, int upper) {
71+
return str.codePoints().allMatch(codePoint -> lower <= codePoint && codePoint <= upper);
7572
}
7673

7774
/** Replace Unicode letters in UTF16 representation with their escape sequences. */
78-
protected static String escapeUnicodeForSmtlib(String input) {
75+
public static String escapeUnicodeForSmtlib(String input) {
7976
StringBuilder sb = new StringBuilder();
8077
for (int codePoint : input.codePoints().toArray()) {
81-
if (0x20 <= codePoint && codePoint <= 0x7E) {
78+
if (codePoint == 0x5c) { // 0x5c is s single backslash, as char: '\\'
79+
// Backslashes must be escaped, otherwise they may get substituted when reading back
80+
// the results from the model
81+
sb.append("\\u{5c}");
82+
} else if (0x20 <= codePoint && codePoint <= 0x7E) {
8283
sb.appendCodePoint(codePoint); // normal printable chars
8384
} else {
84-
sb.append("\\u{").append(String.format("%05X", codePoint)).append("}");
85+
sb.append("\\u{").append(Integer.toHexString(codePoint)).append("}");
8586
}
8687
}
8788
return sb.toString();
@@ -98,9 +99,15 @@ public static String unescapeUnicodeForSmtlib(String input) {
9899
}
99100
int codePoint = Integer.parseInt(hexCodePoint, 16);
100101
checkArgument(
101-
isCodePointInRange(codePoint),
102+
0 <= codePoint && codePoint <= 0x2FFFF,
102103
"SMTLIB does only specify Unicode letters from Planes 0-2");
103-
matcher.appendReplacement(sb, Character.toString(codePoint));
104+
String replacement = Character.toString(codePoint);
105+
if (replacement.equals("\\")) {
106+
// Matcher.appendReplacement considers '\' as special character.
107+
// Substitute with '\\' instead
108+
replacement = "\\\\";
109+
}
110+
matcher.appendReplacement(sb, replacement);
104111
}
105112
matcher.appendTail(sb);
106113
return sb.toString();

src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ class CVC4StringFormulaManager extends AbstractStringFormulaManager<Expr, Type,
2929

3030
@Override
3131
protected Expr makeStringImpl(String pValue) {
32-
// The boolean enables escape characters!
3332
return exprManager.mkConst(new CVC4String(escapeUnicodeForSmtlib(pValue), true));
3433
}
3534

src/org/sosy_lab/java_smt/solvers/princess/PrincessStringFormulaManager.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ public class PrincessStringFormulaManager
3737

3838
@Override
3939
protected IExpression makeStringImpl(String value) {
40-
value = unescapeUnicodeForSmtlib(value);
4140
checkArgument(!containsSurrogatePair(value), "Princess does not support surrogate pairs.");
4241
IExpression strExpr = PrincessEnvironment.stringTheory.string2Term(value);
4342
return getFormulaCreator().getEnv().simplify(strExpr); // simplify MOD in chars

src/org/sosy_lab/java_smt/solvers/z3/Z3FormulaCreator.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
package org.sosy_lab.java_smt.solvers.z3;
1010

1111
import static com.google.common.base.Preconditions.checkArgument;
12-
import static org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager.unescapeUnicodeForSmtlib;
1312

1413
import com.google.common.base.Preconditions;
1514
import com.google.common.collect.HashBasedTable;
@@ -58,6 +57,7 @@
5857
import org.sosy_lab.java_smt.api.SolverException;
5958
import org.sosy_lab.java_smt.api.StringFormula;
6059
import org.sosy_lab.java_smt.api.visitors.FormulaVisitor;
60+
import org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager;
6161
import org.sosy_lab.java_smt.basicimpl.FormulaCreator;
6262
import org.sosy_lab.java_smt.basicimpl.FunctionDeclarationImpl;
6363
import org.sosy_lab.java_smt.solvers.z3.Z3Formula.Z3ArrayFormula;
@@ -933,7 +933,8 @@ public Object convertValue(Long value) {
933933
Rational ratValue = Rational.ofString(Native.getNumeralString(environment, value));
934934
return ratValue.isIntegral() ? ratValue.getNum() : ratValue;
935935
} else if (type.isStringType()) {
936-
return unescapeUnicodeForSmtlib(Native.getString(environment, value));
936+
String str = Native.getString(environment, value);
937+
return AbstractStringFormulaManager.unescapeUnicodeForSmtlib(str);
937938
} else if (type.isBitvectorType()) {
938939
return new BigInteger(Native.getNumeralString(environment, value));
939940
} else if (type.isFloatingPointType()) {

src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.sosy_lab.java_smt.api.ProverEnvironment;
2626
import org.sosy_lab.java_smt.api.SolverContext.ProverOptions;
2727
import org.sosy_lab.java_smt.api.SolverException;
28+
import org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager;
2829

2930
/** Test that we can request evaluations from models. */
3031
public class ModelEvaluationTest extends SolverBasedTest0.ParameterizedSolverBasedTest0 {
@@ -168,19 +169,21 @@ public void testGetStringsEvaluation() throws SolverException, InterruptedExcept
168169

169170
// Unicode
170171
evaluateInModel(
171-
smgr.equal(smgr.makeVariable("x"), smgr.makeString("hello æ@€ \u1234 \\u{4321}")),
172+
smgr.equal(
173+
smgr.makeVariable("x"),
174+
smgr.makeString(
175+
AbstractStringFormulaManager.unescapeUnicodeForSmtlib(
176+
"hello æ@€ \u1234 \\u{4321}"))),
172177
smgr.makeVariable("x"),
173178
Lists.newArrayList("hello \u00e6@\u20ac \u1234 \u4321"),
174179
Lists.newArrayList(smgr.makeString("hello \u00e6@\u20ac \u1234 \u4321")));
175180

176-
// TODO Z3 and CVC4 seem to break escaping on invalid Unicode Strings.
177-
/*
178-
evaluateInModel(
181+
// invalid Unicode escape sequences (should be treated as normal characters)
182+
evaluateInModel(
179183
smgr.equal(smgr.makeVariable("x"), smgr.makeString("\\u")),
180184
smgr.makeVariable("x"),
181185
Lists.newArrayList("\\u"),
182186
Lists.newArrayList(smgr.makeString("\\u")));
183-
*/
184187

185188
// foreign variable: x vs y
186189
evaluateInModel(

0 commit comments

Comments
 (0)