diff -Nru jsoup-1.8.2/CHANGES jsoup-1.8.3/CHANGES --- jsoup-1.8.2/CHANGES 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/CHANGES 2015-08-02 20:18:18.000000000 +0000 @@ -1,6 +1,41 @@ jsoup changelog -*** Release 1.8.2 [PENDING] +*** Release 1.8.3 [2015-Aug-02] + * Added support for custom boolean attributes. + + + * When fetching XML URLs, automatically switch to the XML parser instead of the HTML parser. + + + * Performance improvement on parsing larger HTML pages. On Android KitKat, around 1.7x times faster. On Android + Lollipop, ~ 1.3x faster. Improvements largely from re-ordering the HtmlTreeBuilder methods based on analysis of + various websites; also from further memory reduction for nodes with no children, and other tweaks. + + * Fixed an issue in Element.getElementSiblingIndex (and related methods) where sibling elements with the same content + would incorrectly have the same sibling index. + + + * Fixed an issue where unexpected elements in a badly nested table could be moved to the wrong location in the + document. + + + * Fixed an issue where a table nested within a TH cell would parse to an incorrect tree. + + + * When serializing a document using the XHTML encoding entities, if the character set did not support &nbsp; chars + (such as Shift_JIS), the character would be skipped. For visibility, will now always output &xa0; when using XHTML + encoding entities (as &nbsp; is not defined), regardless of the output character set. + + + * Fixed an issue when resolving URLs, if the absolute URL had no path, the relative URL was not normalized correctly. + Also fixed an issue where connections that were redirected to a relative URL did not have the same normalization + rules as a URL read from Nodes.absUrl(String). + + + * When serialising XML, ensure that '<' characters in attributes are escaped, per spec. Not required in HTML. 
+ + +*** Release 1.8.2 [2015-Apr-13] * Performance improvements for parsing HTML on Android, of 1.5x to 1.9x, with larger parses getting a bigger speed increase. For non-Android JREs, around 1.1x to 1.2x. diff -Nru jsoup-1.8.2/debian/changelog jsoup-1.8.3/debian/changelog --- jsoup-1.8.2/debian/changelog 2015-04-29 13:19:38.000000000 +0000 +++ jsoup-1.8.3/debian/changelog 2015-08-29 21:43:46.000000000 +0000 @@ -1,3 +1,10 @@ +jsoup (1.8.3-1) unstable; urgency=medium + + * New upstream release (Closes: #797275) + * Refreshed the patch + + -- Emmanuel Bourg Sat, 29 Aug 2015 22:40:04 +0200 + jsoup (1.8.2-1) unstable; urgency=medium * New upstream release diff -Nru jsoup-1.8.2/debian/maven.ignoreRules jsoup-1.8.3/debian/maven.ignoreRules --- jsoup-1.8.2/debian/maven.ignoreRules 2015-04-29 13:20:07.000000000 +0000 +++ jsoup-1.8.3/debian/maven.ignoreRules 2015-08-29 21:06:46.000000000 +0000 @@ -14,5 +14,6 @@ # from the POM # junit junit jar s/3\\..*/3.x/ +org.apache.maven.plugins maven-release-plugin * * * * org.apache.maven.plugins maven-source-plugin * * * * org.codehaus.mojo animal-sniffer-maven-plugin * * * * diff -Nru jsoup-1.8.2/debian/patches/dfsg-free-test-data.patch jsoup-1.8.3/debian/patches/dfsg-free-test-data.patch --- jsoup-1.8.2/debian/patches/dfsg-free-test-data.patch 2015-04-29 13:17:34.000000000 +0000 +++ jsoup-1.8.3/debian/patches/dfsg-free-test-data.patch 2015-08-29 20:40:50.000000000 +0000 @@ -1287,7 +1287,7 @@ + --- a/src/test/java/org/jsoup/nodes/DocumentTest.java +++ b/src/test/java/org/jsoup/nodes/DocumentTest.java -@@ -91,7 +91,7 @@ +@@ -93,7 +93,7 @@ TextUtil.stripNewlines(clone.html())); } diff -Nru jsoup-1.8.2/pom.xml jsoup-1.8.3/pom.xml --- jsoup-1.8.2/pom.xml 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/pom.xml 2015-08-02 20:18:18.000000000 +0000 @@ -5,7 +5,7 @@ org.jsoup jsoup - 1.8.2 + 1.8.3 jsoup HTML parser http://jsoup.org/ 2009 @@ -24,6 +24,7 @@ http://github.com/jhy/jsoup scm:git:http://github.com/jhy/jsoup.git + jsoup-1.8.3a 
Jonathan Hedley @@ -130,6 +131,10 @@ maven-resources-plugin 2.4 + + maven-release-plugin + 2.5.2 + diff -Nru jsoup-1.8.2/release.properties jsoup-1.8.3/release.properties --- jsoup-1.8.2/release.properties 1970-01-01 00:00:00.000000000 +0000 +++ jsoup-1.8.3/release.properties 2015-08-02 20:18:18.000000000 +0000 @@ -0,0 +1,11 @@ +#release configuration +#Sun Aug 02 13:14:42 PDT 2015 +scm.tagNameFormat=@{project.artifactId}-@{project.version} +pushChanges=true +scm.url=scm\:git\:http\://github.com/jhy/jsoup.git +preparationGoals=clean verify +projectVersionPolicyId=default +remoteTagging=true +scm.commentPrefix=[maven-release-plugin] +exec.snapshotReleasePluginAllowed=false +completedPhase=check-poms diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/Connection.java jsoup-1.8.3/src/main/java/org/jsoup/Connection.java --- jsoup-1.8.2/src/main/java/org/jsoup/Connection.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/Connection.java 2015-08-02 20:18:18.000000000 +0000 @@ -1,6 +1,7 @@ package org.jsoup; import org.jsoup.nodes.Document; +import org.jsoup.parser.HtmlTreeBuilder; import org.jsoup.parser.Parser; import java.io.IOException; @@ -26,12 +27,12 @@ /** * GET and POST http methods. */ - public enum Method { + enum Method { GET(false), POST(true), PUT(true), DELETE(false), PATCH(true); private final boolean hasBody; - private Method(boolean hasBody) { + Method(boolean hasBody) { this.hasBody = hasBody; } @@ -49,21 +50,21 @@ * @param url URL to connect to * @return this Connection, for chaining */ - public Connection url(URL url); + Connection url(URL url); /** * Set the request URL to fetch. The protocol must be HTTP or HTTPS. * @param url URL to connect to * @return this Connection, for chaining */ - public Connection url(String url); + Connection url(String url); /** * Set the request user-agent header. 
* @param userAgent user-agent to use * @return this Connection, for chaining */ - public Connection userAgent(String userAgent); + Connection userAgent(String userAgent); /** * Set the request timeouts (connect and read). If a timeout occurs, an IOException will be thrown. The default @@ -71,7 +72,7 @@ * @param millis number of milliseconds (thousandths of a second) before timing out connects or reads. * @return this Connection, for chaining */ - public Connection timeout(int millis); + Connection timeout(int millis); /** * Set the maximum bytes to read from the (uncompressed) connection into the body, before the connection is closed, @@ -80,28 +81,28 @@ * @param bytes number of bytes to read from the input before truncating * @return this Connection, for chaining */ - public Connection maxBodySize(int bytes); + Connection maxBodySize(int bytes); /** * Set the request referrer (aka "referer") header. * @param referrer referrer to use * @return this Connection, for chaining */ - public Connection referrer(String referrer); + Connection referrer(String referrer); /** * Configures the connection to (not) follow server redirects. By default this is true. * @param followRedirects true if server redirects should be followed. * @return this Connection, for chaining */ - public Connection followRedirects(boolean followRedirects); + Connection followRedirects(boolean followRedirects); /** * Set the request method to use, GET or POST. Default is GET. * @param method HTTP request method * @return this Connection, for chaining */ - public Connection method(Method method); + Connection method(Method method); /** * Configures the connection to not throw exceptions when a HTTP error occurs. (4xx - 5xx, e.g. 404 or 500). By @@ -110,7 +111,7 @@ * @param ignoreHttpErrors - false (default) if HTTP errors should be ignored. 
* @return this Connection, for chaining */ - public Connection ignoreHttpErrors(boolean ignoreHttpErrors); + Connection ignoreHttpErrors(boolean ignoreHttpErrors); /** * Ignore the document's Content-Type when parsing the response. By default this is false, an unrecognised @@ -120,7 +121,7 @@ * Document. * @return this Connection, for chaining */ - public Connection ignoreContentType(boolean ignoreContentType); + Connection ignoreContentType(boolean ignoreContentType); /** * Disable/enable TSL certificates validation for HTTPS requests. @@ -148,7 +149,7 @@ * @param value data value * @return this Connection, for chaining */ - public Connection data(String key, String value); + Connection data(String key, String value); /** * Add an input stream as a request data paramater. For GETs, has no effect, but for POSTS this will upload the @@ -160,21 +161,21 @@ * You must close the InputStream in a {@code finally} block. * @return this Connections, for chaining */ - public Connection data(String key, String filename, InputStream inputStream); + Connection data(String key, String filename, InputStream inputStream); /** * Adds all of the supplied data to the request data parameters * @param data collection of data parameters * @return this Connection, for chaining */ - public Connection data(Collection data); + Connection data(Collection data); /** * Adds all of the supplied data to the request data parameters * @param data map of data parameters * @return this Connection, for chaining */ - public Connection data(Map data); + Connection data(Map data); /** * Add a number of request data parameters. Multiple parameters may be set at once, e.g.: .data("name", @@ -183,7 +184,7 @@ * @param keyvals a set of key value pairs. * @return this Connection, for chaining */ - public Connection data(String... keyvals); + Connection data(String... keyvals); /** * Set a request header. 
@@ -192,7 +193,7 @@ * @return this Connection, for chaining * @see org.jsoup.Connection.Request#headers() */ - public Connection header(String name, String value); + Connection header(String name, String value); /** * Set a cookie to be sent in the request. @@ -200,28 +201,29 @@ * @param value value of cookie * @return this Connection, for chaining */ - public Connection cookie(String name, String value); + Connection cookie(String name, String value); /** * Adds each of the supplied cookies to the request. * @param cookies map of cookie name {@literal ->} value pairs * @return this Connection, for chaining */ - public Connection cookies(Map cookies); + Connection cookies(Map cookies); /** - * Provide an alternate parser to use when parsing the response to a Document. + * Provide an alternate parser to use when parsing the response to a Document. If not set, defaults to the HTML + * parser, unless the response content-type is XML, in which case the XML parser is used. * @param parser alternate parser * @return this Connection, for chaining */ - public Connection parser(Parser parser); + Connection parser(Parser parser); /** * Sets the default post data character set for x-www-form-urlencoded post data * @param charset character set to encode post data * @return this Connection, for chaining */ - public Connection postDataCharset(String charset); + Connection postDataCharset(String charset); /** * Execute the request as a GET, and parse the result. @@ -232,7 +234,7 @@ * @throws java.net.SocketTimeoutException if the connection times out * @throws IOException on error */ - public Document get() throws IOException; + Document get() throws IOException; /** * Execute the request as a POST, and parse the result. @@ -243,7 +245,7 @@ * @throws java.net.SocketTimeoutException if the connection times out * @throws IOException on error */ - public Document post() throws IOException; + Document post() throws IOException; /** * Execute the request. 
@@ -254,33 +256,33 @@ * @throws java.net.SocketTimeoutException if the connection times out * @throws IOException on error */ - public Response execute() throws IOException; + Response execute() throws IOException; /** * Get the request object associated with this connection * @return request */ - public Request request(); + Request request(); /** * Set the connection's request * @param request new request object * @return this Connection, for chaining */ - public Connection request(Request request); + Connection request(Request request); /** * Get the response, once the request has been executed * @return response */ - public Response response(); + Response response(); /** * Set the connection's response * @param response new response * @return this Connection, for chaining */ - public Connection response(Response response); + Connection response(Response response); /** * Common methods for Requests and Responses @@ -292,27 +294,27 @@ * Get the URL * @return URL */ - public URL url(); + URL url(); /** * Set the URL * @param url new URL * @return this, for chaining */ - public T url(URL url); + T url(URL url); /** * Get the request method * @return method */ - public Method method(); + Method method(); /** * Set the request method * @param method new method * @return this, for chaining */ - public T method(Method method); + T method(Method method); /** * Get the value of a header. This is a simplified header model, where a header may only have one value. @@ -324,7 +326,7 @@ * @see #hasHeader(String) * @see #cookie(String) */ - public String header(String name); + String header(String name); /** * Set a header. This method will overwrite any existing header with the same case insensitive name. 
@@ -332,14 +334,14 @@ * @param value Value of header * @return this, for chaining */ - public T header(String name, String value); + T header(String name, String value); /** * Check if a header is present * @param name name of header (case insensitive) * @return if the header is present in this request/response */ - public boolean hasHeader(String name); + boolean hasHeader(String name); /** * Check if a header is present, with the given value @@ -347,20 +349,20 @@ * @param value value (case insensitive) * @return if the header and value pair are set in this req/res */ - public boolean hasHeaderWithValue(String name, String value); + boolean hasHeaderWithValue(String name, String value); /** * Remove a header by name * @param name name of header to remove (case insensitive) * @return this, for chaining */ - public T removeHeader(String name); + T removeHeader(String name); /** * Retrieve all of the request/response headers as a map * @return headers */ - public Map headers(); + Map headers(); /** * Get a cookie value by name from this request/response. @@ -371,7 +373,7 @@ * @param name name of cookie to retrieve. * @return value of cookie, or null if not set */ - public String cookie(String name); + String cookie(String name); /** * Set a cookie in this request/response. @@ -379,101 +381,101 @@ * @param value value of cookie * @return this, for chaining */ - public T cookie(String name, String value); + T cookie(String name, String value); /** * Check if a cookie is present * @param name name of cookie * @return if the cookie is present in this request/response */ - public boolean hasCookie(String name); + boolean hasCookie(String name); /** * Remove a cookie by name * @param name name of cookie to remove * @return this, for chaining */ - public T removeCookie(String name); + T removeCookie(String name); /** * Retrieve all of the request/response cookies as a map * @return cookies */ - public Map cookies(); + Map cookies(); } /** * Represents a HTTP request. 
*/ - public interface Request extends Base { + interface Request extends Base { /** * Get the request timeout, in milliseconds. * @return the timeout in milliseconds. */ - public int timeout(); + int timeout(); /** * Update the request timeout. * @param millis timeout, in milliseconds * @return this Request, for chaining */ - public Request timeout(int millis); + Request timeout(int millis); /** * Get the maximum body size, in bytes. * @return the maximum body size, in bytes. */ - public int maxBodySize(); + int maxBodySize(); /** * Update the maximum body size, in bytes. * @param bytes maximum body size, in bytes. * @return this Request, for chaining */ - public Request maxBodySize(int bytes); + Request maxBodySize(int bytes); /** * Get the current followRedirects configuration. * @return true if followRedirects is enabled. */ - public boolean followRedirects(); + boolean followRedirects(); /** * Configures the request to (not) follow server redirects. By default this is true. * @param followRedirects true if server redirects should be followed. * @return this Request, for chaining */ - public Request followRedirects(boolean followRedirects); + Request followRedirects(boolean followRedirects); /** * Get the current ignoreHttpErrors configuration. * @return true if errors will be ignored; false (default) if HTTP errors will cause an IOException to be * thrown. */ - public boolean ignoreHttpErrors(); + boolean ignoreHttpErrors(); /** * Configures the request to ignore HTTP errors in the response. * @param ignoreHttpErrors set to true to ignore HTTP errors. * @return this Request, for chaining */ - public Request ignoreHttpErrors(boolean ignoreHttpErrors); + Request ignoreHttpErrors(boolean ignoreHttpErrors); /** * Get the current ignoreContentType configuration. * @return true if invalid content-types will be ignored; false (default) if they will cause an IOException to * be thrown. 
*/ - public boolean ignoreContentType(); + boolean ignoreContentType(); /** * Configures the request to ignore the Content-Type of the response. * @param ignoreContentType set to true to ignore the content type. * @return this Request, for chaining */ - public Request ignoreContentType(boolean ignoreContentType); + Request ignoreContentType(boolean ignoreContentType); /** * Get the current state of TLS (SSL) certificate validation. @@ -492,139 +494,139 @@ * @param keyval data to add. * @return this Request, for chaining */ - public Request data(KeyVal keyval); + Request data(KeyVal keyval); /** * Get all of the request's data parameters * @return collection of keyvals */ - public Collection data(); + Collection data(); /** * Specify the parser to use when parsing the document. * @param parser parser to use. * @return this Request, for chaining */ - public Request parser(Parser parser); + Request parser(Parser parser); /** * Get the current parser to use when parsing the document. * @return current Parser */ - public Parser parser(); + Parser parser(); /** * Sets the post data character set for x-www-form-urlencoded post data * @param charset character set to encode post data * @return this Request, for chaining */ - public Request postDataCharset(String charset); + Request postDataCharset(String charset); /** * Gets the post data character set for x-www-form-urlencoded post data * @return character set to encode post data */ - public String postDataCharset(); + String postDataCharset(); } /** * Represents a HTTP response. */ - public interface Response extends Base { + interface Response extends Base { /** * Get the status code of the response. * @return status code */ - public int statusCode(); + int statusCode(); /** * Get the status message of the response. * @return status message */ - public String statusMessage(); + String statusMessage(); /** * Get the character set name of the response. 
* @return character set name */ - public String charset(); + String charset(); /** * Get the response content type (e.g. "text/html"); * @return the response content type */ - public String contentType(); + String contentType(); /** * Parse the body of the response as a Document. * @return a parsed Document * @throws IOException on error */ - public Document parse() throws IOException; + Document parse() throws IOException; /** * Get the body of the response as a plain string. * @return body */ - public String body(); + String body(); /** * Get the body of the response as an array of bytes. * @return body bytes */ - public byte[] bodyAsBytes(); + byte[] bodyAsBytes(); } /** * A Key Value tuple. */ - public interface KeyVal { + interface KeyVal { /** * Update the key of a keyval * @param key new key * @return this KeyVal, for chaining */ - public KeyVal key(String key); + KeyVal key(String key); /** * Get the key of a keyval * @return the key */ - public String key(); + String key(); /** * Update the value of a keyval * @param value the new value * @return this KeyVal, for chaining */ - public KeyVal value(String value); + KeyVal value(String value); /** * Get the value of a keyval * @return the value */ - public String value(); + String value(); /** * Add or update an input stream to this keyVal * @param inputStream new input stream * @return this KeyVal, for chaining */ - public KeyVal inputStream(InputStream inputStream); + KeyVal inputStream(InputStream inputStream); /** * Get the input stream associated with this keyval, if any * @return input stream if set, or null */ - public InputStream inputStream(); + InputStream inputStream(); /** * Does this keyval have an input stream? 
* @return true if this keyval does indeed have an input stream */ - public boolean hasInputStream(); + boolean hasInputStream(); } } diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/helper/DataUtil.java jsoup-1.8.3/src/main/java/org/jsoup/helper/DataUtil.java --- jsoup-1.8.2/src/main/java/org/jsoup/helper/DataUtil.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/helper/DataUtil.java 2015-08-02 20:18:18.000000000 +0000 @@ -162,8 +162,7 @@ } outStream.write(buffer, 0, read); } - ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray()); - return byteData; + return ByteBuffer.wrap(outStream.toByteArray()); } static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException { diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/helper/HttpConnection.java jsoup-1.8.3/src/main/java/org/jsoup/helper/HttpConnection.java --- jsoup-1.8.2/src/main/java/org/jsoup/helper/HttpConnection.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/helper/HttpConnection.java 2015-08-02 20:18:18.000000000 +0000 @@ -357,6 +357,7 @@ private boolean ignoreHttpErrors = false; private boolean ignoreContentType = false; private Parser parser; + private boolean parserDefined = false; // called parser(...) vs initialized in ctor private boolean validateTSLCertificates = true; private String postDataCharset = DataUtil.defaultCharset; @@ -437,6 +438,7 @@ public Request parser(Parser parser) { this.parser = parser; + parserDefined = true; return this; } @@ -470,11 +472,9 @@ private Connection.Request req; /* - * For example {@code application/atom+xml;charset=utf-8}. - * Stepping through it: start with {@code "application/"}, follow with word - * characters up to a {@code "+xml"}, and then maybe more ({@code .*}). 
+ * Matches XML content types (like text/xml, application/xhtml+xml;charset=UTF8, etc) */ - private static final Pattern xmlContentTypeRxp = Pattern.compile("application/\\w+\\+xml.*"); + private static final Pattern xmlContentTypeRxp = Pattern.compile("(application|text)/\\w*\\+?xml.*"); Response() { super(); @@ -526,7 +526,7 @@ String location = res.header(LOCATION); if (location != null && location.startsWith("http:/") && location.charAt(6) != '/') // fix broken Location: http:/temp/AAG_New/en/index.php location = location.substring(6); - req.url(new URL(req.url(), encodeUrl(location))); + req.url(StringUtil.resolve(req.url(), encodeUrl(location))); for (Map.Entry cookie : res.cookies.entrySet()) { // add response cookies to request (for e.g. login posts) req.cookie(cookie.getKey(), cookie.getValue()); @@ -541,12 +541,19 @@ if (contentType != null && !req.ignoreContentType() && !contentType.startsWith("text/") - && !contentType.startsWith("application/xml") && !xmlContentTypeRxp.matcher(contentType).matches() ) throw new UnsupportedMimeTypeException("Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml", contentType, req.url().toString()); + // switch to the XML parser if content type is xml and not parser not explicitly set + if (contentType != null && xmlContentTypeRxp.matcher(contentType).matches()) { + // only flip it if a HttpConnection.Request (i.e. don't presume other impls want it): + if (req instanceof HttpConnection.Request && !((Request) req).parserDefined) { + req.parser(Parser.xmlParser()); + } + } + res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it if (conn.getContentLength() != 0) { // -1 means unknown, chunked. 
sun throws an IO exception on 500 response with no content when trying to read body InputStream bodyStream = null; diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/helper/StringUtil.java jsoup-1.8.3/src/main/java/org/jsoup/helper/StringUtil.java --- jsoup-1.8.2/src/main/java/org/jsoup/helper/StringUtil.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/helper/StringUtil.java 2015-08-02 20:18:18.000000000 +0000 @@ -1,5 +1,8 @@ package org.jsoup.helper; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Arrays; import java.util.Collection; import java.util.Iterator; @@ -150,4 +153,49 @@ } return false; } + + public static boolean inSorted(String needle, String[] haystack) { + return Arrays.binarySearch(haystack, needle) >= 0; + } + + /** + * Create a new absolute URL, from a provided existing absolute URL and a relative URL component. + * @param base the existing absolulte base URL + * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned) + * @return the resolved absolute URL + * @throws MalformedURLException if an error occurred generating the URL + */ + public static URL resolve(URL base, String relUrl) throws MalformedURLException { + // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired + if (relUrl.startsWith("?")) + relUrl = base.getPath() + relUrl; + // workaround: //example.com + ./foo = //example.com/./foo, not //example.com/foo + if (relUrl.indexOf('.') == 0 && base.getFile().indexOf('/') != 0) { + base = new URL(base.getProtocol(), base.getHost(), base.getPort(), "/" + base.getFile()); + } + return new URL(base, relUrl); + } + + /** + * Create a new absolute URL, from a provided existing absolute URL and a relative URL component. + * @param baseUrl the existing absolute base URL + * @param relUrl the relative URL to resolve. 
(If it's already absolute, it will be returned) + * @return an absolute URL if one was able to be generated, or the empty string if not + */ + public static String resolve(final String baseUrl, final String relUrl) { + URL base; + try { + try { + base = new URL(baseUrl); + } catch (MalformedURLException e) { + // the base is unsuitable, but the attribute/rel may be abs on its own, so try that + URL abs = new URL(relUrl); + return abs.toExternalForm(); + } + return resolve(base, relUrl).toExternalForm(); + } catch (MalformedURLException e) { + return ""; + } + + } } diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/nodes/Attribute.java jsoup-1.8.3/src/main/java/org/jsoup/nodes/Attribute.java --- jsoup-1.8.2/src/main/java/org/jsoup/nodes/Attribute.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/nodes/Attribute.java 2015-08-02 20:18:18.000000000 +0000 @@ -121,7 +121,11 @@ protected final boolean shouldCollapseAttribute(Document.OutputSettings out) { return ("".equals(value) || value.equalsIgnoreCase(key)) && out.syntax() == Document.OutputSettings.Syntax.html - && Arrays.binarySearch(booleanAttributes, key) >= 0; + && isBooleanAttribute(); + } + + protected boolean isBooleanAttribute() { + return Arrays.binarySearch(booleanAttributes, key) >= 0; } @Override @@ -132,9 +136,7 @@ Attribute attribute = (Attribute) o; if (key != null ? !key.equals(attribute.key) : attribute.key != null) return false; - if (value != null ? !value.equals(attribute.value) : attribute.value != null) return false; - - return true; + return !(value != null ? 
!value.equals(attribute.value) : attribute.value != null); } @Override diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/nodes/Attributes.java jsoup-1.8.3/src/main/java/org/jsoup/nodes/Attributes.java --- jsoup-1.8.2/src/main/java/org/jsoup/nodes/Attributes.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/nodes/Attributes.java 2015-08-02 20:18:18.000000000 +0000 @@ -48,6 +48,18 @@ Attribute attr = new Attribute(key, value); put(attr); } + + /** + Set a new boolean attribute, remove attribute if value is false. + @param key attribute key + @param value attribute value + */ + public void put(String key, boolean value) { + if (value) + put(new BooleanAttribute(key)); + else + remove(key); + } /** Set a new attribute, or replace an existing one by key. diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/nodes/BooleanAttribute.java jsoup-1.8.3/src/main/java/org/jsoup/nodes/BooleanAttribute.java --- jsoup-1.8.2/src/main/java/org/jsoup/nodes/BooleanAttribute.java 1970-01-01 00:00:00.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/nodes/BooleanAttribute.java 2015-08-02 20:18:18.000000000 +0000 @@ -0,0 +1,19 @@ +package org.jsoup.nodes; + +/** + * A boolean attribute that is written out without any value. + */ +public class BooleanAttribute extends Attribute { + /** + * Create a new boolean attribute from unencoded (raw) key. 
+ * @param key attribute key + */ + public BooleanAttribute(String key) { + super(key, ""); + } + + @Override + protected boolean isBooleanAttribute() { + return true; + } +} diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/nodes/Document.java jsoup-1.8.3/src/main/java/org/jsoup/nodes/Document.java --- jsoup-1.8.2/src/main/java/org/jsoup/nodes/Document.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/nodes/Document.java 2015-08-02 20:18:18.000000000 +0000 @@ -266,7 +266,7 @@ * @see #charset(java.nio.charset.Charset) */ public void updateMetaCharsetElement(boolean update) { - this.updateMetaCharset = true; + this.updateMetaCharset = update; } /** @@ -308,7 +308,7 @@ * */ private void ensureMetaCharsetElement() { - if (updateMetaCharset == true) { + if (updateMetaCharset) { OutputSettings.Syntax syntax = outputSettings().syntax(); if (syntax == OutputSettings.Syntax.html) { @@ -551,7 +551,7 @@ } public enum QuirksMode { - noQuirks, quirks, limitedQuirks; + noQuirks, quirks, limitedQuirks } public QuirksMode quirksMode() { @@ -562,10 +562,4 @@ this.quirksMode = quirksMode; return this; } - - @Override - public boolean equals(Object o) { - return super.equals(o); - } } - diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/nodes/Element.java jsoup-1.8.3/src/main/java/org/jsoup/nodes/Element.java --- jsoup-1.8.2/src/main/java/org/jsoup/nodes/Element.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/nodes/Element.java 2015-08-02 20:18:18.000000000 +0000 @@ -116,6 +116,21 @@ super.attr(attributeKey, attributeValue); return this; } + + /** + * Set a boolean attribute value on this element. Setting to true sets the attribute value to "" and + * marks the attribute as boolean so no value is written out. Setting to false removes the attribute + * with the same key if it exists. 
+ * + * @param attributeKey the attribute key + * @param attributeValue the attribute value + * + * @return this element + */ + public Element attr(String attributeKey, boolean attributeValue) { + attributes.put(attributeKey, attributeValue); + return this; + } /** * Get this element's HTML5 custom data attributes. Each attribute in the element that has a key @@ -269,6 +284,7 @@ // was - Node#addChildren(child). short-circuits an array create and a loop. reparentChild(child); + ensureChildNodes(); childNodes.add(child); child.setSiblingIndex(childNodes.size() - 1); return this; @@ -571,7 +587,7 @@ for (int i = 0; i < elements.size(); i++) { E element = elements.get(i); - if (element.equals(search)) + if (element == search) return i; } return null; diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/nodes/Entities.java jsoup-1.8.3/src/main/java/org/jsoup/nodes/Entities.java --- jsoup-1.8.2/src/main/java/org/jsoup/nodes/Entities.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/nodes/Entities.java 2015-08-02 20:18:18.000000000 +0000 @@ -115,10 +115,11 @@ if (escapeMode != EscapeMode.xhtml) accum.append("&nbsp;"); else - accum.append(c); + accum.append("&#xa0;"); break; case '<': - if (!inAttribute) + // escape when in character data or when in a xml attribue val; not needed in html attr val + if (!inAttribute || escapeMode == EscapeMode.xhtml) accum.append("&lt;"); else accum.append(c); diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/nodes/FormElement.java jsoup-1.8.3/src/main/java/org/jsoup/nodes/FormElement.java --- jsoup-1.8.2/src/main/java/org/jsoup/nodes/FormElement.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/nodes/FormElement.java 2015-08-02 20:18:18.000000000 +0000 @@ -59,11 +59,9 @@ Connection.Method method = attr("method").toUpperCase().equals("POST") ? 
Connection.Method.POST : Connection.Method.GET; - Connection con = Jsoup.connect(action) + return Jsoup.connect(action) .data(formData()) .method(method); - - return con; } /** @@ -106,9 +104,4 @@ } return data; } - - @Override - public boolean equals(Object o) { - return super.equals(o); - } } diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/nodes/Node.java jsoup-1.8.3/src/main/java/org/jsoup/nodes/Node.java --- jsoup-1.8.2/src/main/java/org/jsoup/nodes/Node.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/nodes/Node.java 2015-08-02 20:18:18.000000000 +0000 @@ -18,6 +18,7 @@ @author Jonathan Hedley, jonathan@hedley.net */ public abstract class Node implements Cloneable { + private static final List EMPTY_NODES = Collections.emptyList(); Node parentNode; List childNodes; Attributes attributes; @@ -33,7 +34,7 @@ Validate.notNull(baseUri); Validate.notNull(attributes); - childNodes = new ArrayList(4); + childNodes = EMPTY_NODES; this.baseUri = baseUri.trim(); this.attributes = attributes; } @@ -46,7 +47,7 @@ * Default constructor. Doesn't setup base uri, children, or attributes; use with caution. 
*/ protected Node() { - childNodes = Collections.emptyList(); + childNodes = EMPTY_NODES; attributes = null; } @@ -178,27 +179,10 @@ public String absUrl(String attributeKey) { Validate.notEmpty(attributeKey); - String relUrl = attr(attributeKey); if (!hasAttr(attributeKey)) { return ""; // nothing to make absolute with } else { - URL base; - try { - try { - base = new URL(baseUri); - } catch (MalformedURLException e) { - // the base is unsuitable, but the attribute may be abs on its own, so try that - URL abs = new URL(relUrl); - return abs.toExternalForm(); - } - // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired - if (relUrl.startsWith("?")) - relUrl = base.getPath() + relUrl; - URL abs = new URL(base, relUrl); - return abs.toExternalForm(); - } catch (MalformedURLException e) { - return ""; - } + return StringUtil.resolve(baseUri, attr(attributeKey)); } } @@ -445,6 +429,7 @@ //most used. short circuit addChildren(int), which hits reindex children and array copy for (Node child: children) { reparentChild(child); + ensureChildNodes(); childNodes.add(child); child.setSiblingIndex(childNodes.size()-1); } @@ -455,11 +440,18 @@ for (int i = children.length - 1; i >= 0; i--) { Node in = children[i]; reparentChild(in); + ensureChildNodes(); childNodes.add(index, in); } reindexChildren(index); } + protected void ensureChildNodes() { + if (childNodes == EMPTY_NODES) { + childNodes = new ArrayList(4); + } + } + protected void reparentChild(Node child) { if (child.parentNode != null) child.parentNode.removeChild(child); diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/nodes/XmlDeclaration.java jsoup-1.8.3/src/main/java/org/jsoup/nodes/XmlDeclaration.java --- jsoup-1.8.2/src/main/java/org/jsoup/nodes/XmlDeclaration.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/nodes/XmlDeclaration.java 2015-08-02 20:18:18.000000000 +0000 @@ -31,7 +31,7 @@ public String getWholeDeclaration() { final String decl 
= attributes.get(DECL_KEY); - if( decl.equals("xml") == true && attributes.size() > 1 ) { + if(decl.equals("xml") && attributes.size() > 1 ) { StringBuilder sb = new StringBuilder(decl); final String version = attributes.get("version"); diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java jsoup-1.8.3/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java --- jsoup-1.8.2/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java 2015-08-02 20:18:18.000000000 +0000 @@ -11,7 +11,7 @@ /** * HTML Tree Builder; creates a DOM from Tokens. */ -class HtmlTreeBuilder extends TreeBuilder { +public class HtmlTreeBuilder extends TreeBuilder { // tag searches private static final String[] TagsScriptStyle = new String[]{"script", "style"}; public static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}; @@ -391,7 +391,7 @@ if ("select".equals(name)) { transition(HtmlTreeBuilderState.InSelect); break; // frag - } else if (("td".equals(name) || "td".equals(name) && !last)) { + } else if (("td".equals(name) || "th".equals(name) && !last)) { transition(HtmlTreeBuilderState.InCell); break; } else if ("tr".equals(name)) { diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java jsoup-1.8.3/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java --- jsoup-1.8.2/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java 2015-08-02 20:18:18.000000000 +0000 @@ -271,7 +271,51 @@ case StartTag: Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); - if (name.equals("html")) { + if (name.equals("a")) { + if (tb.getActiveFormattingElement("a") != null) { + tb.error(this); + tb.processEndTag("a"); + + // still on stack? 
+ Element remainingA = tb.getFromStack("a"); + if (remainingA != null) { + tb.removeFromActiveFormattingElements(remainingA); + tb.removeFromStack(remainingA); + } + } + tb.reconstructFormattingElements(); + Element a = tb.insert(startTag); + tb.pushActiveFormattingElements(a); + } else if (StringUtil.inSorted(name, Constants.InBodyStartEmptyFormatters)) { + tb.reconstructFormattingElements(); + tb.insertEmpty(startTag); + tb.framesetOk(false); + } else if (StringUtil.inSorted(name, Constants.InBodyStartPClosers)) { + if (tb.inButtonScope("p")) { + tb.processEndTag("p"); + } + tb.insert(startTag); + } else if (name.equals("span")) { + // same as final else, but short circuits lots of checks + tb.reconstructFormattingElements(); + tb.insert(startTag); + } else if (name.equals("li")) { + tb.framesetOk(false); + ArrayList stack = tb.getStack(); + for (int i = stack.size() - 1; i > 0; i--) { + Element el = stack.get(i); + if (el.nodeName().equals("li")) { + tb.processEndTag("li"); + break; + } + if (tb.isSpecial(el) && !StringUtil.inSorted(el.nodeName(), Constants.InBodyStartLiBreakers)) + break; + } + if (tb.inButtonScope("p")) { + tb.processEndTag("p"); + } + tb.insert(startTag); + } else if (name.equals("html")) { tb.error(this); // merge attributes onto real html Element html = tb.getStack().get(0); @@ -279,7 +323,7 @@ if (!html.hasAttr(attribute.getKey())) html.attributes().put(attribute); } - } else if (StringUtil.in(name, Constants.InBodyStartToHead)) { + } else if (StringUtil.inSorted(name, Constants.InBodyStartToHead)) { return tb.process(t, InHead); } else if (name.equals("body")) { tb.error(this); @@ -313,21 +357,16 @@ tb.insert(startTag); tb.transition(InFrameset); } - } else if (StringUtil.in(name, Constants.InBodyStartPClosers)) { + } else if (StringUtil.inSorted(name, Constants.Headings)) { if (tb.inButtonScope("p")) { tb.processEndTag("p"); } - tb.insert(startTag); - } else if (StringUtil.in(name, Constants.Headings)) { - if (tb.inButtonScope("p")) { - 
tb.processEndTag("p"); - } - if (StringUtil.in(tb.currentElement().nodeName(), Constants.Headings)) { + if (StringUtil.inSorted(tb.currentElement().nodeName(), Constants.Headings)) { tb.error(this); tb.pop(); } tb.insert(startTag); - } else if (StringUtil.in(name, Constants.InBodyStartPreListing)) { + } else if (StringUtil.inSorted(name, Constants.InBodyStartPreListing)) { if (tb.inButtonScope("p")) { tb.processEndTag("p"); } @@ -343,32 +382,16 @@ tb.processEndTag("p"); } tb.insertForm(startTag, true); - } else if (name.equals("li")) { - tb.framesetOk(false); - ArrayList stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (el.nodeName().equals("li")) { - tb.processEndTag("li"); - break; - } - if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), Constants.InBodyStartLiBreakers)) - break; - } - if (tb.inButtonScope("p")) { - tb.processEndTag("p"); - } - tb.insert(startTag); - } else if (StringUtil.in(name, Constants.DdDt)) { + } else if (StringUtil.inSorted(name, Constants.DdDt)) { tb.framesetOk(false); ArrayList stack = tb.getStack(); for (int i = stack.size() - 1; i > 0; i--) { Element el = stack.get(i); - if (StringUtil.in(el.nodeName(), Constants.DdDt)) { + if (StringUtil.inSorted(el.nodeName(), Constants.DdDt)) { tb.processEndTag(el.nodeName()); break; } - if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), Constants.InBodyStartLiBreakers)) + if (tb.isSpecial(el) && !StringUtil.inSorted(el.nodeName(), Constants.InBodyStartLiBreakers)) break; } if (tb.inButtonScope("p")) { @@ -392,22 +415,7 @@ tb.insert(startTag); tb.framesetOk(false); } - } else if (name.equals("a")) { - if (tb.getActiveFormattingElement("a") != null) { - tb.error(this); - tb.processEndTag("a"); - - // still on stack? 
- Element remainingA = tb.getFromStack("a"); - if (remainingA != null) { - tb.removeFromActiveFormattingElements(remainingA); - tb.removeFromStack(remainingA); - } - } - tb.reconstructFormattingElements(); - Element a = tb.insert(startTag); - tb.pushActiveFormattingElements(a); - } else if (StringUtil.in(name, Constants.Formatters)) { + } else if (StringUtil.inSorted(name, Constants.Formatters)) { tb.reconstructFormattingElements(); Element el = tb.insert(startTag); tb.pushActiveFormattingElements(el); @@ -420,7 +428,7 @@ } Element el = tb.insert(startTag); tb.pushActiveFormattingElements(el); - } else if (StringUtil.in(name, Constants.InBodyStartApplets)) { + } else if (StringUtil.inSorted(name, Constants.InBodyStartApplets)) { tb.reconstructFormattingElements(); tb.insert(startTag); tb.insertMarkerToFormattingElements(); @@ -432,16 +440,12 @@ tb.insert(startTag); tb.framesetOk(false); tb.transition(InTable); - } else if (StringUtil.in(name, Constants.InBodyStartEmptyFormatters)) { - tb.reconstructFormattingElements(); - tb.insertEmpty(startTag); - tb.framesetOk(false); } else if (name.equals("input")) { tb.reconstructFormattingElements(); Element el = tb.insertEmpty(startTag); if (!el.attr("type").equalsIgnoreCase("hidden")) tb.framesetOk(false); - } else if (StringUtil.in(name, Constants.InBodyStartMedia)) { + } else if (StringUtil.inSorted(name, Constants.InBodyStartMedia)) { tb.insertEmpty(startTag); } else if (name.equals("hr")) { if (tb.inButtonScope("p")) { @@ -478,7 +482,7 @@ // input Attributes inputAttribs = new Attributes(); for (Attribute attr : startTag.attributes) { - if (!StringUtil.in(attr.getKey(), Constants.InBodyStartInputAttribs)) + if (!StringUtil.inSorted(attr.getKey(), Constants.InBodyStartInputAttribs)) inputAttribs.put(attr); } inputAttribs.put("name", "isindex"); @@ -516,12 +520,12 @@ tb.transition(InSelectInTable); else tb.transition(InSelect); - } else if (StringUtil.in(name, Constants.InBodyStartOptions)) { + } else if 
(StringUtil.inSorted(name, Constants.InBodyStartOptions)) { if (tb.currentElement().nodeName().equals("option")) tb.processEndTag("option"); tb.reconstructFormattingElements(); tb.insert(startTag); - } else if (StringUtil.in(name, Constants.InBodyStartRuby)) { + } else if (StringUtil.inSorted(name, Constants.InBodyStartRuby)) { if (tb.inScope("ruby")) { tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals("ruby")) { @@ -540,7 +544,7 @@ // todo: handle A start tag whose tag name is "svg" (xlink, svg) tb.insert(startTag); tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (StringUtil.in(name, Constants.InBodyStartDrop)) { + } else if (StringUtil.inSorted(name, Constants.InBodyStartDrop)) { tb.error(this); return false; } else { @@ -552,87 +556,7 @@ case EndTag: Token.EndTag endTag = t.asEndTag(); name = endTag.name(); - if (name.equals("body")) { - if (!tb.inScope("body")) { - tb.error(this); - return false; - } else { - // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html - tb.transition(AfterBody); - } - } else if (name.equals("html")) { - boolean notIgnored = tb.processEndTag("body"); - if (notIgnored) - return tb.process(endTag); - } else if (StringUtil.in(name, Constants.InBodyEndClosers)) { - if (!tb.inScope(name)) { - // nothing to close - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - } - } else if (name.equals("form")) { - Element currentForm = tb.getFormElement(); - tb.setFormElement(null); - if (currentForm == null || !tb.inScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - // remove currentForm from stack. will shift anything under up. 
- tb.removeFromStack(currentForm); - } - } else if (name.equals("p")) { - if (!tb.inButtonScope(name)) { - tb.error(this); - tb.processStartTag(name); // if no p to close, creates an empty

- return tb.process(endTag); - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - } - } else if (name.equals("li")) { - if (!tb.inListItemScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - } - } else if (StringUtil.in(name, Constants.DdDt)) { - if (!tb.inScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - } - } else if (StringUtil.in(name, Constants.Headings)) { - if (!tb.inScope(Constants.Headings)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(Constants.Headings); - } - } else if (name.equals("sarcasm")) { - // *sigh* - return anyOtherEndTag(t, tb); - } else if (StringUtil.in(name, Constants.InBodyEndAdoptionFormatters)) { + if (StringUtil.inSorted(name, Constants.InBodyEndAdoptionFormatters)) { // Adoption Agency Algorithm. 
for (int i = 0; i < 8; i++) { Element formatEl = tb.getActiveFormattingElement(name); @@ -700,7 +624,7 @@ lastNode = node; } - if (StringUtil.in(commonAncestor.nodeName(), Constants.InBodyEndTableFosters)) { + if (StringUtil.inSorted(commonAncestor.nodeName(), Constants.InBodyEndTableFosters)) { if (lastNode.parent() != null) lastNode.remove(); tb.insertInFosterParent(lastNode); @@ -722,7 +646,90 @@ tb.removeFromStack(formatEl); tb.insertOnStackAfter(furthestBlock, adopter); } - } else if (StringUtil.in(name, Constants.InBodyStartApplets)) { + } else if (StringUtil.inSorted(name, Constants.InBodyEndClosers)) { + if (!tb.inScope(name)) { + // nothing to close + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + } + } else if (name.equals("span")) { + // same as final fall through, but saves short circuit + return anyOtherEndTag(t, tb); + } else if (name.equals("li")) { + if (!tb.inListItemScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + } + } else if (name.equals("body")) { + if (!tb.inScope("body")) { + tb.error(this); + return false; + } else { + // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html + tb.transition(AfterBody); + } + } else if (name.equals("html")) { + boolean notIgnored = tb.processEndTag("body"); + if (notIgnored) + return tb.process(endTag); + } else if (name.equals("form")) { + Element currentForm = tb.getFormElement(); + tb.setFormElement(null); + if (currentForm == null || !tb.inScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + // remove currentForm from stack. will shift anything under up. 
+ tb.removeFromStack(currentForm); + } + } else if (name.equals("p")) { + if (!tb.inButtonScope(name)) { + tb.error(this); + tb.processStartTag(name); // if no p to close, creates an empty

+ return tb.process(endTag); + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + } + } else if (StringUtil.inSorted(name, Constants.DdDt)) { + if (!tb.inScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + } + } else if (StringUtil.inSorted(name, Constants.Headings)) { + if (!tb.inScope(Constants.Headings)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(Constants.Headings); + } + } else if (name.equals("sarcasm")) { + // *sigh* + return anyOtherEndTag(t, tb); + } else if (StringUtil.inSorted(name, Constants.InBodyStartApplets)) { if (!tb.inScope("name")) { if (!tb.inScope(name)) { tb.error(this); diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/parser/Parser.java jsoup-1.8.3/src/main/java/org/jsoup/parser/Parser.java --- jsoup-1.8.2/src/main/java/org/jsoup/parser/Parser.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/parser/Parser.java 2015-08-02 20:18:18.000000000 +0000 @@ -27,8 +27,7 @@ public Document parseInput(String html, String baseUri) { errors = isTrackErrors() ? 
ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); - Document doc = treeBuilder.parse(html, baseUri, errors); - return doc; + return treeBuilder.parse(html, baseUri, errors); } // gets & sets diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/parser/Tag.java jsoup-1.8.3/src/main/java/org/jsoup/parser/Tag.java --- jsoup-1.8.2/src/main/java/org/jsoup/parser/Tag.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/parser/Tag.java 2015-08-02 20:18:18.000000000 +0000 @@ -193,9 +193,7 @@ if (preserveWhitespace != tag.preserveWhitespace) return false; if (selfClosing != tag.selfClosing) return false; if (formList != tag.formList) return false; - if (formSubmit != tag.formSubmit) return false; - - return true; + return formSubmit == tag.formSubmit; } @Override diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/parser/TokeniserState.java jsoup-1.8.3/src/main/java/org/jsoup/parser/TokeniserState.java --- jsoup-1.8.2/src/main/java/org/jsoup/parser/TokeniserState.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/parser/TokeniserState.java 2015-08-02 20:18:18.000000000 +0000 @@ -757,6 +757,7 @@ break; case eof: t.eofError(this); + t.emitTagPending(); t.transition(Data); break; case '>': @@ -782,6 +783,8 @@ String value = r.consumeToAnySorted(attributeDoubleValueCharsSorted); if (value.length() > 0) t.tagPending.appendAttributeValue(value); + else + t.tagPending.setEmptyAttributeValue(); char c = r.consume(); switch (c) { @@ -812,6 +815,8 @@ String value = r.consumeToAnySorted(attributeSingleValueCharsSorted); if (value.length() > 0) t.tagPending.appendAttributeValue(value); + else + t.tagPending.setEmptyAttributeValue(); char c = r.consume(); switch (c) { diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/parser/Token.java jsoup-1.8.3/src/main/java/org/jsoup/parser/Token.java --- jsoup-1.8.2/src/main/java/org/jsoup/parser/Token.java 2015-04-14 04:32:24.000000000 +0000 +++ 
jsoup-1.8.3/src/main/java/org/jsoup/parser/Token.java 2015-08-02 20:18:18.000000000 +0000 @@ -3,6 +3,7 @@ import org.jsoup.helper.Validate; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.BooleanAttribute; /** * Parse tokens for the Tokeniser. @@ -69,6 +70,7 @@ protected String tagName; private String pendingAttributeName; // attribute names are generally caught in one hop, not accumulated private StringBuilder pendingAttributeValue = new StringBuilder(); // but values are accumulated, from e.g. & in hrefs + private boolean hasEmptyAttributeValue = false; // distinguish boolean attribute from empty string value private boolean hasPendingAttributeValue = false; boolean selfClosing = false; Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used). @@ -78,6 +80,7 @@ tagName = null; pendingAttributeName = null; reset(pendingAttributeValue); + hasEmptyAttributeValue = false; hasPendingAttributeValue = false; selfClosing = false; attributes = null; @@ -90,13 +93,17 @@ if (pendingAttributeName != null) { Attribute attribute; - if (!hasPendingAttributeValue) + if (hasPendingAttributeValue) + attribute = new Attribute(pendingAttributeName, pendingAttributeValue.toString()); + else if (hasEmptyAttributeValue) attribute = new Attribute(pendingAttributeName, ""); else - attribute = new Attribute(pendingAttributeName, pendingAttributeValue.toString()); + attribute = new BooleanAttribute(pendingAttributeName); attributes.put(attribute); } pendingAttributeName = null; + hasEmptyAttributeValue = false; + hasPendingAttributeValue = false; reset(pendingAttributeValue); } @@ -158,6 +165,10 @@ ensureAttributeValue(); pendingAttributeValue.append(append); } + + final void setEmptyAttributeValue() { + hasEmptyAttributeValue = true; + } private void ensureAttributeValue() { hasPendingAttributeValue = true; @@ -315,7 +326,7 @@ return type == 
TokenType.EOF; } - static enum TokenType { + enum TokenType { Doctype, StartTag, EndTag, diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/parser/TokenQueue.java jsoup-1.8.3/src/main/java/org/jsoup/parser/TokenQueue.java --- jsoup-1.8.2/src/main/java/org/jsoup/parser/TokenQueue.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/parser/TokenQueue.java 2015-08-02 20:18:18.000000000 +0000 @@ -209,8 +209,7 @@ pos++; } - String data = queue.substring(start, pos); - return data; + return queue.substring(start, pos); } /** @@ -226,8 +225,7 @@ pos++; } - String data = queue.substring(start, pos); - return data; + return queue.substring(start, pos); } /** diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/parser/TreeBuilder.java jsoup-1.8.3/src/main/java/org/jsoup/parser/TreeBuilder.java --- jsoup-1.8.2/src/main/java/org/jsoup/parser/TreeBuilder.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/parser/TreeBuilder.java 2015-08-02 20:18:18.000000000 +0000 @@ -58,16 +58,25 @@ protected abstract boolean process(Token token); protected boolean processStartTag(String name) { + if (currentToken == start) { // don't recycle an in-use token + return process(new Token.StartTag().name(name)); + } return process(start.reset().name(name)); } public boolean processStartTag(String name, Attributes attrs) { + if (currentToken == start) { // don't recycle an in-use token + return process(new Token.StartTag().nameAttr(name, attrs)); + } start.reset(); start.nameAttr(name, attrs); return process(start); } protected boolean processEndTag(String name) { + if (currentToken == end) { // don't recycle an in-use token + return process(new Token.EndTag().name(name)); + } return process(end.reset().name(name)); } diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/select/NodeVisitor.java jsoup-1.8.3/src/main/java/org/jsoup/select/NodeVisitor.java --- jsoup-1.8.2/src/main/java/org/jsoup/select/NodeVisitor.java 2015-04-14 04:32:24.000000000 +0000 +++ 
jsoup-1.8.3/src/main/java/org/jsoup/select/NodeVisitor.java 2015-08-02 20:18:18.000000000 +0000 @@ -18,7 +18,7 @@ * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node * of that will have depth 1. */ - public void head(Node node, int depth); + void head(Node node, int depth); /** * Callback for when a node is last visited, after all of its descendants have been visited. @@ -27,5 +27,5 @@ * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node * of that will have depth 1. */ - public void tail(Node node, int depth); + void tail(Node node, int depth); } diff -Nru jsoup-1.8.2/src/main/java/org/jsoup/select/Selector.java jsoup-1.8.3/src/main/java/org/jsoup/select/Selector.java --- jsoup-1.8.2/src/main/java/org/jsoup/select/Selector.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/main/java/org/jsoup/select/Selector.java 2015-08-02 20:18:18.000000000 +0000 @@ -41,7 +41,7 @@ * E ~ Fan F element preceded by sibling Eh1 ~ p * E, F, Gall matching elements E, F, or Ga[href], div, h3 *

Pseudo selectors

- * :lt(n)elements whose sibling index is less than ntd:lt(3) finds the first 2 cells of each row + * :lt(n)elements whose sibling index is less than ntd:lt(3) finds the first 3 cells of each row * :gt(n)elements whose sibling index is greater than ntd:gt(1) finds cells after skipping the first two * :eq(n)elements whose sibling index is equal to ntd:eq(0) finds the first cell of each row * :has(selector)elements that contains at least one element matching the selectordiv:has(p) finds divs that contain p elements diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/helper/StringUtilTest.java jsoup-1.8.3/src/test/java/org/jsoup/helper/StringUtilTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/helper/StringUtilTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/helper/StringUtilTest.java 2015-08-02 20:18:18.000000000 +0000 @@ -5,6 +5,7 @@ import java.util.Arrays; +import static org.jsoup.helper.StringUtil.*; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -58,18 +59,35 @@ } @Test public void normaliseWhiteSpace() { - assertEquals(" ", StringUtil.normaliseWhitespace(" \r \n \r\n")); - assertEquals(" hello there ", StringUtil.normaliseWhitespace(" hello \r \n there \n")); - assertEquals("hello", StringUtil.normaliseWhitespace("hello")); - assertEquals("hello there", StringUtil.normaliseWhitespace("hello\nthere")); + assertEquals(" ", normaliseWhitespace(" \r \n \r\n")); + assertEquals(" hello there ", normaliseWhitespace(" hello \r \n there \n")); + assertEquals("hello", normaliseWhitespace("hello")); + assertEquals("hello there", normaliseWhitespace("hello\nthere")); } @Test public void normaliseWhiteSpaceHandlesHighSurrogates() { String test71540chars = "\ud869\udeb2\u304b\u309a 1"; String test71540charsExpectedSingleWhitespace = "\ud869\udeb2\u304b\u309a 1"; - assertEquals(test71540charsExpectedSingleWhitespace, 
StringUtil.normaliseWhitespace(test71540chars)); + assertEquals(test71540charsExpectedSingleWhitespace, normaliseWhitespace(test71540chars)); String extractedText = Jsoup.parse(test71540chars).text(); assertEquals(test71540charsExpectedSingleWhitespace, extractedText); } + + @Test public void resolvesRelativeUrls() { + assertEquals("http://example.com/one/two?three", resolve("http://example.com", "./one/two?three")); + assertEquals("http://example.com/one/two?three", resolve("http://example.com?one", "./one/two?three")); + assertEquals("http://example.com/one/two?three#four", resolve("http://example.com", "./one/two?three#four")); + assertEquals("https://example.com/one", resolve("http://example.com/", "https://example.com/one")); + assertEquals("http://example.com/one/two.html", resolve("http://example.com/two/", "../one/two.html")); + assertEquals("https://example2.com/one", resolve("https://example.com/", "//example2.com/one")); + assertEquals("https://example.com:8080/one", resolve("https://example.com:8080", "./one")); + assertEquals("https://example2.com/one", resolve("http://example.com/", "https://example2.com/one")); + assertEquals("https://example.com/one", resolve("wrong", "https://example.com/one")); + assertEquals("https://example.com/one", resolve("https://example.com/one", "")); + assertEquals("", resolve("wrong", "also wrong")); + assertEquals("ftp://example.com/one", resolve("ftp://example.com/two/", "../one")); + assertEquals("ftp://example.com/one/two.c", resolve("ftp://example.com/one/", "./two.c")); + assertEquals("ftp://example.com/one/two.c", resolve("ftp://example.com/one/", "two.c")); + } } diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/integration/UrlConnectTest.java jsoup-1.8.3/src/test/java/org/jsoup/integration/UrlConnectTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/integration/UrlConnectTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/integration/UrlConnectTest.java 2015-08-02 20:18:18.000000000 
+0000 @@ -4,9 +4,13 @@ import org.jsoup.HttpStatusException; import org.jsoup.Jsoup; import org.jsoup.UnsupportedMimeTypeException; +import org.jsoup.helper.StringUtil; import org.jsoup.helper.W3CDom; import org.jsoup.nodes.Document; import org.jsoup.nodes.FormElement; +import org.jsoup.parser.HtmlTreeBuilder; +import org.jsoup.parser.Parser; +import org.jsoup.parser.XmlTreeBuilder; import org.junit.Ignore; import org.junit.Test; @@ -173,12 +177,30 @@ @Test public void followsRelativeRedirect() throws IOException { - Connection con = Jsoup.connect("http://direct.infohound.net/tools/302-rel.pl"); // to ./ - /tools/ + Connection con = Jsoup.connect("http://direct.infohound.net/tools/302-rel.pl"); // to /tidy/ Document doc = con.post(); assertTrue(doc.title().contains("HTML Tidy Online")); } @Test + public void followsRelativeDotRedirect() throws IOException { + // redirects to "./ok.html", should resolve to http://direct.infohound.net/tools/ok.html + Connection con = Jsoup.connect("http://direct.infohound.net/tools/302-rel-dot.pl"); // to ./ok.html + Document doc = con.post(); + assertTrue(doc.title().contains("OK")); + assertEquals(doc.location(), "http://direct.infohound.net/tools/ok.html"); + } + + @Test + public void followsRelativeDotRedirect2() throws IOException { + //redirects to "esportspenedes.cat/./ep/index.php", should resolve to "esportspenedes.cat/ep/index.php" + Connection con = Jsoup.connect("http://esportspenedes.cat") // note lack of trailing / - server should redir to / first, then to ./ep/...; but doesn't' + .timeout(10000); + Document doc = con.post(); + assertEquals(doc.location(), "http://esportspenedes.cat/ep/index.php"); + } + + @Test public void followsRedirectsWithWithespaces() throws IOException { Connection con = Jsoup.connect("http://tinyurl.com/kgofxl8"); // to http://www.google.com/?q=white spaces Document doc = con.get(); @@ -494,4 +516,26 @@ assertTrue(html.contains("jsoup")); } + @Test + public void fetchHandlesXml() throws 
IOException { + // should auto-detect xml and use XML parser, unless explicitly requested the html parser + String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml"; + Connection con = Jsoup.connect(xmlUrl); + Document doc = con.get(); + Connection.Request req = con.request(); + assertTrue(req.parser().getTreeBuilder() instanceof XmlTreeBuilder); + assertEquals(" one Two
", StringUtil.normaliseWhitespace(doc.outerHtml())); + } + + @Test + public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException { + // should auto-detect xml and use XML parser, unless explicitly requested the html parser + String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml"; + Connection con = Jsoup.connect(xmlUrl).parser(Parser.htmlParser()); + Document doc = con.get(); + Connection.Request req = con.request(); + assertTrue(req.parser().getTreeBuilder() instanceof HtmlTreeBuilder); + assertEquals(" one Two
", StringUtil.normaliseWhitespace(doc.outerHtml())); + } + } diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/nodes/DocumentTest.java jsoup-1.8.3/src/test/java/org/jsoup/nodes/DocumentTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/nodes/DocumentTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/nodes/DocumentTest.java 2015-08-02 20:18:18.000000000 +0000 @@ -1,7 +1,9 @@ package org.jsoup.nodes; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.nio.charset.Charset; import org.jsoup.Jsoup; import org.jsoup.TextUtil; @@ -382,4 +384,27 @@ return doc; } + + @Test + public void testShiftJisRoundtrip() throws Exception { + String input = + "" + + "" + + "" + + "" + + "" + + "before after" + + "" + + ""; + InputStream is = new ByteArrayInputStream(input.getBytes(Charset.forName("ASCII"))); + + Document doc = Jsoup.parse(is, null, "http://example.com"); + doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); + + String output = new String(doc.html().getBytes(doc.outputSettings().charset()), doc.outputSettings().charset()); + + assertFalse("Should not have contained a '?'.", output.contains("?")); + assertTrue("Should have contained a ' ' or a ' '.", + output.contains(" ") || output.contains(" ")); + } } diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/nodes/ElementTest.java jsoup-1.8.3/src/test/java/org/jsoup/nodes/ElementTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/nodes/ElementTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/nodes/ElementTest.java 2015-08-02 20:18:18.000000000 +0000 @@ -117,6 +117,18 @@ assertEquals("element", p.lastElementSibling().text()); } + @Test public void testGetSiblingsWithDuplicateContent() { + Document doc = Jsoup.parse("

Hello

there

this

this

is

an

element

"); + Element p = doc.getElementById("1"); + assertEquals("there", p.text()); + assertEquals("Hello", p.previousElementSibling().text()); + assertEquals("this", p.nextElementSibling().text()); + assertEquals("this", p.nextElementSibling().nextElementSibling().text()); + assertEquals("is", p.nextElementSibling().nextElementSibling().nextElementSibling().text()); + assertEquals("Hello", p.firstElementSibling().text()); + assertEquals("element", p.lastElementSibling().text()); + } + @Test public void testGetParents() { Document doc = Jsoup.parse("

Hello there

"); Element span = doc.select("span").first(); @@ -137,6 +149,14 @@ assertTrue(2 == ps.get(2).elementSiblingIndex()); } + @Test public void testElementSiblingIndexSameContent() { + Document doc = Jsoup.parse("

One

...

One

...

One

"); + Elements ps = doc.select("p"); + assertTrue(0 == ps.get(0).elementSiblingIndex()); + assertTrue(1 == ps.get(1).elementSiblingIndex()); + assertTrue(2 == ps.get(2).elementSiblingIndex()); + } + @Test public void testGetElementsWithClass() { Document doc = Jsoup.parse("
Hello Yellow!

Empty

"); @@ -294,6 +314,26 @@ assertEquals(i, ps.get(i).siblingIndex); } } + + @Test public void testAddBooleanAttribute() { + Element div = new Element(Tag.valueOf("div"), ""); + + div.attr("true", true); + + div.attr("false", "value"); + div.attr("false", false); + + assertTrue(div.hasAttr("true")); + assertEquals("", div.attr("true")); + + List attributes = div.attributes().asList(); + assertEquals("There should be one attribute", 1, attributes.size()); + assertTrue("Attribute should be boolean", attributes.get(0) instanceof BooleanAttribute); + + assertFalse(div.hasAttr("false")); + + assertEquals("
", div.outerHtml()); + } @Test public void testAppendRowToTable() { Document doc = Jsoup.parse("
1
"); @@ -798,4 +838,16 @@ assertFalse(e0.hashCode() == (e6).hashCode()); assertFalse(e0.hashCode() == (e7).hashCode()); } + + @Test public void testRelativeUrls() { + String html = "One two Three Four Five"; + Document doc = Jsoup.parse(html, "http://example.com/bar/"); + Elements els = doc.select("a"); + + assertEquals("http://example.com/bar/one.html", els.get(0).absUrl("href")); + assertEquals("http://example.com/bar/two.html", els.get(1).absUrl("href")); + assertEquals("http://example.com/three.html", els.get(2).absUrl("href")); + assertEquals("http://example2.com/four/", els.get(3).absUrl("href")); + assertEquals("https://example2.com/five/", els.get(4).absUrl("href")); + } } diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/nodes/EntitiesTest.java jsoup-1.8.3/src/test/java/org/jsoup/nodes/EntitiesTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/nodes/EntitiesTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/nodes/EntitiesTest.java 2015-08-02 20:18:18.000000000 +0000 @@ -14,7 +14,7 @@ String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base)); String escapedAsciiFull = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(extended)); String escapedAsciiXhtml = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(xhtml)); - String escapedUtfFull = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(base)); + String escapedUtfFull = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended)); String escapedUtfMin = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(xhtml)); assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAscii); @@ -86,4 +86,19 @@ String string = "http://www.foo.com?a=1&num_rooms=1&children=0&int=VA&b=2"; assertEquals(string, Entities.unescape(string)); } + + @Test public void escapesGtInXmlAttributesButNotInHtml() { + // https://github.com/jhy/jsoup/issues/528 - < is OK in 
HTML attribute values, but not in XML + + + String docHtml = "One"; + Document doc = Jsoup.parse(docHtml); + Element element = doc.select("a").first(); + + doc.outputSettings().escapeMode(base); + assertEquals("One

\">One
", element.outerHtml()); + + doc.outputSettings().escapeMode(xhtml); + assertEquals("One</p>\">One", element.outerHtml()); + } } diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/nodes/FormElementTest.java jsoup-1.8.3/src/test/java/org/jsoup/nodes/FormElementTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/nodes/FormElementTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/nodes/FormElementTest.java 2015-08-02 20:18:18.000000000 +0000 @@ -122,4 +122,26 @@ assertEquals("on", data.get(0).value()); assertEquals("foo", data.get(0).key()); } + + @Test public void adoptedFormsRetainInputs() { + // test for https://github.com/jhy/jsoup/issues/249 + String html = "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
User:
Password:
\n" + + "\n" + + ""; + Document doc = Jsoup.parse(html); + FormElement form = (FormElement) doc.select("form").first(); + List data = form.formData(); + assertEquals(3, data.size()); + assertEquals("user", data.get(0).key()); + assertEquals("pass", data.get(1).key()); + assertEquals("login", data.get(2).key()); + } } diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/nodes/NodeTest.java jsoup-1.8.3/src/test/java/org/jsoup/nodes/NodeTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/nodes/NodeTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/nodes/NodeTest.java 2015-08-02 20:18:18.000000000 +0000 @@ -124,6 +124,12 @@ Element a2 = doc.select("a").get(1); assertEquals("http://jsoup.org/path/bar.html?foo", a2.absUrl("href")); } + + @Test public void absHandlesDotFromIndex() { + Document doc = Jsoup.parse("One", "http://example.com"); + Element a1 = doc.select("a").first(); + assertEquals("http://example.com/one/two.html", a1.absUrl("href")); + } @Test public void testRemove() { Document doc = Jsoup.parse("

One two three

"); diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/parser/AttributeParseTest.java jsoup-1.8.3/src/test/java/org/jsoup/parser/AttributeParseTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/parser/AttributeParseTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/parser/AttributeParseTest.java 2015-08-02 20:18:18.000000000 +0000 @@ -1,7 +1,11 @@ package org.jsoup.parser; +import java.util.List; + import org.jsoup.Jsoup; +import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.BooleanAttribute; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.junit.Test; @@ -66,4 +70,24 @@ Elements els = Jsoup.parse(html).select("a"); assertEquals("&wr_id=123&mid-size=true&ok=&wr", els.first().attr("href")); } + + @Test public void parsesBooleanAttributes() { + String html = ""; + Element el = Jsoup.parse(html).select("a").first(); + + assertEquals("123", el.attr("normal")); + assertEquals("", el.attr("boolean")); + assertEquals("", el.attr("empty")); + + List attributes = el.attributes().asList(); + assertEquals("There should be 3 attribute present", 3, attributes.size()); + + // Assuming the list order always follows the parsed html + assertFalse("'normal' attribute should not be boolean", attributes.get(0) instanceof BooleanAttribute); + assertTrue("'boolean' attribute should be boolean", attributes.get(1) instanceof BooleanAttribute); + assertFalse("'empty' attribute should not be boolean", attributes.get(2) instanceof BooleanAttribute); + + assertEquals(html, el.outerHtml()); + } + } diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/parser/HtmlParserTest.java jsoup-1.8.3/src/test/java/org/jsoup/parser/HtmlParserTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/parser/HtmlParserTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/parser/HtmlParserTest.java 2015-08-02 20:18:18.000000000 +0000 @@ -3,10 +3,13 @@ import org.jsoup.Jsoup; import 
org.jsoup.TextUtil; import org.jsoup.helper.StringUtil; +import org.jsoup.integration.ParseTest; import org.jsoup.nodes.*; import org.jsoup.select.Elements; import org.junit.Test; +import java.io.File; +import java.io.IOException; import java.util.List; import static org.junit.Assert.assertEquals; @@ -43,11 +46,11 @@ String html = "

OneSomething

Else"; // this gets a

with attr '=a' and an OneSomething

\n" + - "Else", doc.body().html()); + assertEquals("

OneSomething

\n" + + "Else", doc.body().html()); doc = Jsoup.parse("

"); - assertEquals("

", doc.body().html()); + assertEquals("

", doc.body().html()); } @Test public void parsesComments() { @@ -847,4 +850,30 @@ assertEquals(50000, doc.body().childNodeSize()); assertTrue(System.currentTimeMillis() - start < 1000); } + + @Test + public void testInvalidTableContents() throws IOException { + File in = ParseTest.getFile("/htmltests/table-invalid-elements.html"); + Document doc = Jsoup.parse(in, "UTF-8"); + doc.outputSettings().prettyPrint(true); + String rendered = doc.toString(); + int endOfEmail = rendered.indexOf("Comment"); + int guarantee = rendered.indexOf("Why am I here?"); + assertTrue("Comment not found", endOfEmail > -1); + assertTrue("Search text not found", guarantee > -1); + assertTrue("Search text did not come after comment", guarantee > endOfEmail); + } + + @Test public void testNormalisesIsIndex() { + Document doc = Jsoup.parse(""); + String html = doc.outerHtml(); + assertEquals("


", + StringUtil.normaliseWhitespace(doc.body().html())); + } + + @Test public void testReinsertionModeForThCelss() { + String body = "
"; + Document doc = Jsoup.parse(body); + assertEquals(1, doc.body().children().size()); + } } diff -Nru jsoup-1.8.2/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java jsoup-1.8.3/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java --- jsoup-1.8.2/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java 2015-04-14 04:32:24.000000000 +0000 +++ jsoup-1.8.3/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java 2015-08-02 20:18:18.000000000 +0000 @@ -17,8 +17,7 @@ import java.util.List; import static org.jsoup.nodes.Document.OutputSettings.Syntax; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotSame; +import static org.junit.Assert.*; /** * Tests XmlTreeBuilder. @@ -70,13 +69,16 @@ // parse with both xml and html parser, ensure different Document xmlDoc = Jsoup.connect(xmlUrl).parser(Parser.xmlParser()).get(); - Document htmlDoc = Jsoup.connect(xmlUrl).get(); + Document htmlDoc = Jsoup.connect(xmlUrl).parser(Parser.htmlParser()).get(); + Document autoXmlDoc = Jsoup.connect(xmlUrl).get(); // check connection auto detects xml, uses xml parser assertEquals("OneTwoThree", TextUtil.stripNewlines(xmlDoc.html())); - assertNotSame(htmlDoc, xmlDoc); + assertFalse(htmlDoc.equals(xmlDoc)); + assertEquals(xmlDoc, autoXmlDoc); assertEquals(1, htmlDoc.select("head").size()); // html parser normalises assertEquals(0, xmlDoc.select("head").size()); // xml parser does not + assertEquals(0, autoXmlDoc.select("head").size()); // xml parser does not } @Test @@ -121,4 +123,11 @@ Document doc = Jsoup.parse("x", "", Parser.xmlParser()); assertEquals(Syntax.xml, doc.outputSettings().syntax()); } + + @Test + public void testDoesHandleEOFInTag() { + String html = "", xmlDoc.html()); + } } diff -Nru jsoup-1.8.2/src/test/resources/htmltests/table-invalid-elements.html jsoup-1.8.3/src/test/resources/htmltests/table-invalid-elements.html --- jsoup-1.8.2/src/test/resources/htmltests/table-invalid-elements.html 1970-01-01 00:00:00.000000000 +0000 +++ 
jsoup-1.8.3/src/test/resources/htmltests/table-invalid-elements.html 2015-08-02 20:18:18.000000000 +0000 @@ -0,0 +1,17 @@ + + + + + + +
+ + + +
+

Why am I here?

+ +
+
+ +