|
| 1 | +/* |
| 2 | + * Copyright (C) 2017 Square, Inc. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | +package org.aoju.bus.http.metric.suffix; |
| 17 | + |
| 18 | +import org.aoju.bus.core.io.source.BufferSource; |
| 19 | +import org.aoju.bus.core.io.source.GzipSource; |
| 20 | +import org.aoju.bus.core.toolkit.IoKit; |
| 21 | + |
| 22 | +import java.io.IOException; |
| 23 | +import java.io.InputStream; |
| 24 | +import java.io.InterruptedIOException; |
| 25 | +import java.net.IDN; |
| 26 | +import java.util.concurrent.CountDownLatch; |
| 27 | +import java.util.concurrent.atomic.AtomicBoolean; |
| 28 | + |
| 29 | +import static java.nio.charset.StandardCharsets.UTF_8; |
| 30 | + |
| 31 | +/** |
| 32 | + * A database of public suffixes provided by |
| 33 | + * <a href="https://publicsuffix.org/">publicsuffix.org</a>. |
| 34 | + */ |
| 35 | +public final class Suffixes { |
| 36 | + |
| 37 | + public static final String PUBLIC_SUFFIX_RESOURCE = "suffixes.gz"; |
| 38 | + |
| 39 | + private static final byte[] WILDCARD_LABEL = new byte[]{'*'}; |
| 40 | + private static final String[] EMPTY_RULE = new String[0]; |
| 41 | + private static final String[] PREVAILING_RULE = new String[]{"*"}; |
| 42 | + |
| 43 | + private static final byte EXCEPTION_MARKER = '!'; |
| 44 | + |
| 45 | + private static final Suffixes instance = new Suffixes(); |
| 46 | + |
| 47 | + /** |
| 48 | + * True after we've attempted to read the list for the first time. |
| 49 | + */ |
| 50 | + private final AtomicBoolean listRead = new AtomicBoolean(false); |
| 51 | + |
| 52 | + /** |
| 53 | + * Used for concurrent threads reading the list for the first time. |
| 54 | + */ |
| 55 | + private final CountDownLatch readCompleteLatch = new CountDownLatch(1); |
| 56 | + |
| 57 | + // The lists are held as a large array of UTF-8 bytes. This is to avoid allocating lots of strings |
| 58 | + // that will likely never be used. Each rule is separated by '\n'. Please see the |
| 59 | + // PublicSuffixListGenerator class for how these lists are generated. |
| 60 | + // Guarded by this. |
| 61 | + private byte[] publicSuffixListBytes; |
| 62 | + private byte[] publicSuffixExceptionListBytes; |
| 63 | + |
| 64 | + public static Suffixes get() { |
| 65 | + return instance; |
| 66 | + } |
| 67 | + |
| 68 | + private static String binarySearchBytes(byte[] bytesToSearch, byte[][] labels, int labelIndex) { |
| 69 | + int low = 0; |
| 70 | + int high = bytesToSearch.length; |
| 71 | + String match = null; |
| 72 | + while (low < high) { |
| 73 | + int mid = (low + high) / 2; |
| 74 | + // Search for a '\n' that marks the start of a value. Don't go back past the start of the |
| 75 | + // array. |
| 76 | + while (mid > -1 && bytesToSearch[mid] != '\n') { |
| 77 | + mid--; |
| 78 | + } |
| 79 | + mid++; |
| 80 | + |
| 81 | + // Now look for the ending '\n'. |
| 82 | + int end = 1; |
| 83 | + while (bytesToSearch[mid + end] != '\n') { |
| 84 | + end++; |
| 85 | + } |
| 86 | + int publicSuffixLength = (mid + end) - mid; |
| 87 | + |
| 88 | + // Compare the bytes. Note that the file stores UTF-8 encoded bytes, so we must compare the |
| 89 | + // unsigned bytes. |
| 90 | + int compareResult; |
| 91 | + int currentLabelIndex = labelIndex; |
| 92 | + int currentLabelByteIndex = 0; |
| 93 | + int publicSuffixByteIndex = 0; |
| 94 | + |
| 95 | + boolean expectDot = false; |
| 96 | + while (true) { |
| 97 | + int byte0; |
| 98 | + if (expectDot) { |
| 99 | + byte0 = '.'; |
| 100 | + expectDot = false; |
| 101 | + } else { |
| 102 | + byte0 = labels[currentLabelIndex][currentLabelByteIndex] & 0xff; |
| 103 | + } |
| 104 | + |
| 105 | + int byte1 = bytesToSearch[mid + publicSuffixByteIndex] & 0xff; |
| 106 | + |
| 107 | + compareResult = byte0 - byte1; |
| 108 | + if (compareResult != 0) break; |
| 109 | + |
| 110 | + publicSuffixByteIndex++; |
| 111 | + currentLabelByteIndex++; |
| 112 | + if (publicSuffixByteIndex == publicSuffixLength) break; |
| 113 | + |
| 114 | + if (labels[currentLabelIndex].length == currentLabelByteIndex) { |
| 115 | + // We've exhausted our current label. Either there are more labels to compare, in which |
| 116 | + // case we expect a dot as the next character. Otherwise, we've checked all our labels. |
| 117 | + if (currentLabelIndex == labels.length - 1) { |
| 118 | + break; |
| 119 | + } else { |
| 120 | + currentLabelIndex++; |
| 121 | + currentLabelByteIndex = -1; |
| 122 | + expectDot = true; |
| 123 | + } |
| 124 | + } |
| 125 | + } |
| 126 | + |
| 127 | + if (compareResult < 0) { |
| 128 | + high = mid - 1; |
| 129 | + } else if (compareResult > 0) { |
| 130 | + low = mid + end + 1; |
| 131 | + } else { |
| 132 | + // We found a match, but are the lengths equal? |
| 133 | + int publicSuffixBytesLeft = publicSuffixLength - publicSuffixByteIndex; |
| 134 | + int labelBytesLeft = labels[currentLabelIndex].length - currentLabelByteIndex; |
| 135 | + for (int i = currentLabelIndex + 1; i < labels.length; i++) { |
| 136 | + labelBytesLeft += labels[i].length; |
| 137 | + } |
| 138 | + |
| 139 | + if (labelBytesLeft < publicSuffixBytesLeft) { |
| 140 | + high = mid - 1; |
| 141 | + } else if (labelBytesLeft > publicSuffixBytesLeft) { |
| 142 | + low = mid + end + 1; |
| 143 | + } else { |
| 144 | + // Found a match. |
| 145 | + match = new String(bytesToSearch, mid, publicSuffixLength, UTF_8); |
| 146 | + break; |
| 147 | + } |
| 148 | + } |
| 149 | + } |
| 150 | + return match; |
| 151 | + } |
| 152 | + |
| 153 | + /** |
| 154 | + * Returns the effective top-level domain plus one (eTLD+1) by referencing the public suffix list. |
| 155 | + * Returns null if the domain is a public suffix or a private address. |
| 156 | + * |
| 157 | + * <p>Here are some examples: <pre>{@code |
| 158 | + * assertEquals("google.com", getEffectiveTldPlusOne("google.com")); |
| 159 | + * assertEquals("google.com", getEffectiveTldPlusOne("www.google.com")); |
| 160 | + * assertNull(getEffectiveTldPlusOne("com")); |
| 161 | + * assertNull(getEffectiveTldPlusOne("localhost")); |
| 162 | + * assertNull(getEffectiveTldPlusOne("mymacbook")); |
| 163 | + * }</pre> |
| 164 | + * |
| 165 | + * @param domain A canonicalized domain. An International Domain Name (IDN) should be punycode |
| 166 | + * encoded. |
| 167 | + */ |
| 168 | + public String getEffectiveTldPlusOne(String domain) { |
| 169 | + if (domain == null) throw new NullPointerException("domain == null"); |
| 170 | + |
| 171 | + // We use UTF-8 in the list so we need to convert to Unicode. |
| 172 | + String unicodeDomain = IDN.toUnicode(domain); |
| 173 | + String[] domainLabels = unicodeDomain.split("\\."); |
| 174 | + String[] rule = findMatchingRule(domainLabels); |
| 175 | + if (domainLabels.length == rule.length && rule[0].charAt(0) != EXCEPTION_MARKER) { |
| 176 | + // The domain is a public suffix. |
| 177 | + return null; |
| 178 | + } |
| 179 | + |
| 180 | + int firstLabelOffset; |
| 181 | + if (rule[0].charAt(0) == EXCEPTION_MARKER) { |
| 182 | + // Exception rules hold the effective TLD plus one. |
| 183 | + firstLabelOffset = domainLabels.length - rule.length; |
| 184 | + } else { |
| 185 | + // Otherwise the rule is for a public suffix, so we must take one more label. |
| 186 | + firstLabelOffset = domainLabels.length - (rule.length + 1); |
| 187 | + } |
| 188 | + |
| 189 | + StringBuilder effectiveTldPlusOne = new StringBuilder(); |
| 190 | + String[] punycodeLabels = domain.split("\\."); |
| 191 | + for (int i = firstLabelOffset; i < punycodeLabels.length; i++) { |
| 192 | + effectiveTldPlusOne.append(punycodeLabels[i]).append('.'); |
| 193 | + } |
| 194 | + effectiveTldPlusOne.deleteCharAt(effectiveTldPlusOne.length() - 1); |
| 195 | + |
| 196 | + return effectiveTldPlusOne.toString(); |
| 197 | + } |
| 198 | + |
| 199 | + private String[] findMatchingRule(String[] domainLabels) { |
| 200 | + if (!listRead.get() && listRead.compareAndSet(false, true)) { |
| 201 | + readTheListUninterruptibly(); |
| 202 | + } else { |
| 203 | + try { |
| 204 | + readCompleteLatch.await(); |
| 205 | + } catch (InterruptedException ignored) { |
| 206 | + Thread.currentThread().interrupt(); // Retain interrupted status. |
| 207 | + } |
| 208 | + } |
| 209 | + |
| 210 | + synchronized (this) { |
| 211 | + if (publicSuffixListBytes == null) { |
| 212 | + throw new IllegalStateException("Unable to load " + PUBLIC_SUFFIX_RESOURCE + " resource " |
| 213 | + + "from the classpath."); |
| 214 | + } |
| 215 | + } |
| 216 | + |
| 217 | + // Break apart the domain into UTF-8 labels, i.e. foo.bar.com turns into [foo, bar, com]. |
| 218 | + byte[][] domainLabelsUtf8Bytes = new byte[domainLabels.length][]; |
| 219 | + for (int i = 0; i < domainLabels.length; i++) { |
| 220 | + domainLabelsUtf8Bytes[i] = domainLabels[i].getBytes(UTF_8); |
| 221 | + } |
| 222 | + |
| 223 | + // Start by looking for exact matches. We start at the leftmost label. For example, foo.bar.com |
| 224 | + // will look like: [foo, bar, com], [bar, com], [com]. The longest matching rule wins. |
| 225 | + String exactMatch = null; |
| 226 | + for (int i = 0; i < domainLabelsUtf8Bytes.length; i++) { |
| 227 | + String rule = binarySearchBytes(publicSuffixListBytes, domainLabelsUtf8Bytes, i); |
| 228 | + if (rule != null) { |
| 229 | + exactMatch = rule; |
| 230 | + break; |
| 231 | + } |
| 232 | + } |
| 233 | + |
| 234 | + // In theory, wildcard rules are not restricted to having the wildcard in the leftmost position. |
| 235 | + // In practice, wildcards are always in the leftmost position. For now, this implementation |
| 236 | + // cheats and does not attempt every possible permutation. Instead, it only considers wildcards |
| 237 | + // in the leftmost position. We assert this fact when we generate the public suffix file. If |
| 238 | + // this assertion ever fails we'll need to refactor this implementation. |
| 239 | + String wildcardMatch = null; |
| 240 | + if (domainLabelsUtf8Bytes.length > 1) { |
| 241 | + byte[][] labelsWithWildcard = domainLabelsUtf8Bytes.clone(); |
| 242 | + for (int labelIndex = 0; labelIndex < labelsWithWildcard.length - 1; labelIndex++) { |
| 243 | + labelsWithWildcard[labelIndex] = WILDCARD_LABEL; |
| 244 | + String rule = binarySearchBytes(publicSuffixListBytes, labelsWithWildcard, labelIndex); |
| 245 | + if (rule != null) { |
| 246 | + wildcardMatch = rule; |
| 247 | + break; |
| 248 | + } |
| 249 | + } |
| 250 | + } |
| 251 | + |
| 252 | + // Exception rules only apply to wildcard rules, so only try it if we matched a wildcard. |
| 253 | + String exception = null; |
| 254 | + if (wildcardMatch != null) { |
| 255 | + for (int labelIndex = 0; labelIndex < domainLabelsUtf8Bytes.length - 1; labelIndex++) { |
| 256 | + String rule = binarySearchBytes( |
| 257 | + publicSuffixExceptionListBytes, domainLabelsUtf8Bytes, labelIndex); |
| 258 | + if (rule != null) { |
| 259 | + exception = rule; |
| 260 | + break; |
| 261 | + } |
| 262 | + } |
| 263 | + } |
| 264 | + |
| 265 | + if (exception != null) { |
| 266 | + // Signal we've identified an exception rule. |
| 267 | + exception = "!" + exception; |
| 268 | + return exception.split("\\."); |
| 269 | + } else if (exactMatch == null && wildcardMatch == null) { |
| 270 | + return PREVAILING_RULE; |
| 271 | + } |
| 272 | + |
| 273 | + String[] exactRuleLabels = exactMatch != null |
| 274 | + ? exactMatch.split("\\.") |
| 275 | + : EMPTY_RULE; |
| 276 | + |
| 277 | + String[] wildcardRuleLabels = wildcardMatch != null |
| 278 | + ? wildcardMatch.split("\\.") |
| 279 | + : EMPTY_RULE; |
| 280 | + |
| 281 | + return exactRuleLabels.length > wildcardRuleLabels.length |
| 282 | + ? exactRuleLabels |
| 283 | + : wildcardRuleLabels; |
| 284 | + } |
| 285 | + |
| 286 | + /** |
| 287 | + * Reads the public suffix list treating the operation as uninterruptible. We always want to read |
| 288 | + * the list otherwise we'll be left in a bad state. If the thread was interrupted prior to this |
| 289 | + * operation, it will be re-interrupted after the list is read. |
| 290 | + */ |
| 291 | + private void readTheListUninterruptibly() { |
| 292 | + boolean interrupted = false; |
| 293 | + try { |
| 294 | + while (true) { |
| 295 | + try { |
| 296 | + readTheList(); |
| 297 | + return; |
| 298 | + } catch (InterruptedIOException e) { |
| 299 | + Thread.interrupted(); // Temporarily clear the interrupted state. |
| 300 | + interrupted = true; |
| 301 | + } catch (IOException e) { |
| 302 | + |
| 303 | + return; |
| 304 | + } |
| 305 | + } |
| 306 | + } finally { |
| 307 | + if (interrupted) { |
| 308 | + Thread.currentThread().interrupt(); // Retain interrupted status. |
| 309 | + } |
| 310 | + } |
| 311 | + } |
| 312 | + |
| 313 | + private void readTheList() throws IOException { |
| 314 | + byte[] publicSuffixListBytes; |
| 315 | + byte[] publicSuffixExceptionListBytes; |
| 316 | + |
| 317 | + InputStream resource = Suffixes.class.getResourceAsStream(PUBLIC_SUFFIX_RESOURCE); |
| 318 | + if (resource == null) return; |
| 319 | + |
| 320 | + try (BufferSource BufferSource = IoKit.buffer(new GzipSource(IoKit.source(resource)))) { |
| 321 | + int totalBytes = BufferSource.readInt(); |
| 322 | + publicSuffixListBytes = new byte[totalBytes]; |
| 323 | + BufferSource.readFully(publicSuffixListBytes); |
| 324 | + |
| 325 | + int totalExceptionBytes = BufferSource.readInt(); |
| 326 | + publicSuffixExceptionListBytes = new byte[totalExceptionBytes]; |
| 327 | + BufferSource.readFully(publicSuffixExceptionListBytes); |
| 328 | + } |
| 329 | + |
| 330 | + synchronized (this) { |
| 331 | + this.publicSuffixListBytes = publicSuffixListBytes; |
| 332 | + this.publicSuffixExceptionListBytes = publicSuffixExceptionListBytes; |
| 333 | + } |
| 334 | + |
| 335 | + readCompleteLatch.countDown(); |
| 336 | + } |
| 337 | + |
| 338 | + /** |
| 339 | + * Visible for testing. |
| 340 | + */ |
| 341 | + void setListBytes(byte[] publicSuffixListBytes, byte[] publicSuffixExceptionListBytes) { |
| 342 | + this.publicSuffixListBytes = publicSuffixListBytes; |
| 343 | + this.publicSuffixExceptionListBytes = publicSuffixExceptionListBytes; |
| 344 | + listRead.set(true); |
| 345 | + readCompleteLatch.countDown(); |
| 346 | + } |
| 347 | + |
| 348 | +} |
0 commit comments