# Resolve Fatal Error: java.lang.OutOfMemoryError: PermGen space
JAVA_OPTS="-server -XX:PermSize=256M -XX:MaxPermSize=512M"

JAVA_OPTS="-server -XX:PermSize=256M -XX:MaxPermSize=512M"

Construct | Matches |
---|---|
Characters | |
x | The character x |
\\ | The backslash character |
\0n | The character with octal value 0n (0 <= n <= 7) |
\0nn | The character with octal value 0nn (0 <= n <= 7) |
\0mnn | The character with octal value 0mnn (0 <= m <= 3, 0 <= n <= 7) |
\xhh | The character with hexadecimal value 0xhh |
\uhhhh | The character with hexadecimal value 0xhhhh |
\t | The tab character ('\u0009') |
\n | The newline (line feed) character ('\u000A') |
\r | The carriage-return character ('\u000D') |
\f | The form-feed character ('\u000C') |
\a | The alert (bell) character ('\u0007') |
\e | The escape character ('\u001B') |
\cx | The control character corresponding to x |
Character classes | |
[abc] | a, b, or c (simple class) |
[^abc] | Any character except a, b, or c (negation) |
[a-zA-Z] | a through z or A through Z, inclusive (range) |
[a-d[m-p]] | a through d, or m through p: [a-dm-p] (union) |
[a-z&&[def]] | d, e, or f (intersection) |
[a-z&&[^bc]] | a through z, except for b and c: [ad-z] (subtraction) |
[a-z&&[^m-p]] | a through z, and not m through p: [a-lq-z] (subtraction) |
Predefined character classes | |
. | Any character (may or may not match line terminators) |
\d | A digit: [0-9] |
\D | A non-digit: [^0-9] |
\s | A whitespace character: [ \t\n\x0B\f\r] |
\S | A non-whitespace character: [^\s] |
\w | A word character: [a-zA-Z_0-9] |
\W | A non-word character: [^\w] |
POSIX character classes (US-ASCII only) | |
\p{Lower} | A lower-case alphabetic character: [a-z] |
\p{Upper} | An upper-case alphabetic character: [A-Z] |
\p{ASCII} | All ASCII: [\x00-\x7F] |
\p{Alpha} | An alphabetic character: [\p{Lower}\p{Upper}] |
\p{Digit} | A decimal digit: [0-9] |
\p{Alnum} | An alphanumeric character: [\p{Alpha}\p{Digit}] |
\p{Punct} | Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ |
\p{Graph} | A visible character: [\p{Alnum}\p{Punct}] |
\p{Print} | A printable character: [\p{Graph}\x20] |
\p{Blank} | A space or a tab: [ \t] |
\p{Cntrl} | A control character: [\x00-\x1F\x7F] |
\p{XDigit} | A hexadecimal digit: [0-9a-fA-F] |
\p{Space} | A whitespace character: [ \t\n\x0B\f\r] |
java.lang.Character classes (simple java character type) | |
\p{javaLowerCase} | Equivalent to java.lang.Character.isLowerCase() |
\p{javaUpperCase} | Equivalent to java.lang.Character.isUpperCase() |
\p{javaWhitespace} | Equivalent to java.lang.Character.isWhitespace() |
\p{javaMirrored} | Equivalent to java.lang.Character.isMirrored() |
Classes for Unicode blocks and categories | |
\p{InGreek} | A character in the Greek block (simple block) |
\p{Lu} | An uppercase letter (simple category) |
\p{Sc} | A currency symbol |
\P{InGreek} | Any character except one in the Greek block (negation) |
[\p{L}&&[^\p{Lu}]] | Any letter except an uppercase letter (subtraction) |
Boundary matchers | |
^ | The beginning of a line |
$ | The end of a line |
\b | A word boundary |
\B | A non-word boundary |
\A | The beginning of the input |
\G | The end of the previous match |
\Z | The end of the input but for the final terminator, if any |
\z | The end of the input |
Greedy quantifiers | |
X? | X, once or not at all |
X* | X, zero or more times |
X+ | X, one or more times |
X{n} | X, exactly n times |
X{n,} | X, at least n times |
X{n,m} | X, at least n but not more than m times |
Reluctant quantifiers | |
X?? | X, once or not at all |
X*? | X, zero or more times |
X+? | X, one or more times |
X{n}? | X, exactly n times |
X{n,}? | X, at least n times |
X{n,m}? | X, at least n but not more than m times |
Possessive quantifiers | |
X?+ | X, once or not at all |
X*+ | X, zero or more times |
X++ | X, one or more times |
X{n}+ | X, exactly n times |
X{n,}+ | X, at least n times |
X{n,m}+ | X, at least n but not more than m times |
Logical operators | |
XY | X followed by Y |
X|Y | Either X or Y |
(X) | X, as a capturing group |
Back references | |
\n | Whatever the nth capturing group matched |
Quotation | |
\ | Nothing, but quotes the following character |
\Q | Nothing, but quotes all characters until \E |
\E | Nothing, but ends quoting started by \Q |
Special constructs (non-capturing) | |
(?:X) | X, as a non-capturing group |
(?idmsux-idmsux) | Nothing, but turns match flags i d m s u x on - off |
(?idmsux-idmsux:X) | X, as a non-capturing group with the given flags i d m s u x on - off |
(?=X) | X, via zero-width positive lookahead |
(?!X) | X, via zero-width negative lookahead |
(?<=X) | X, via zero-width positive lookbehind |
(?<!X) | X, via zero-width negative lookbehind |
(?>X) | X, as an independent, non-capturing group |
Match and regex modes | |
Pattern.UNIX_LINES - (?d) | Changes how dot and ^ match |
Pattern.DOTALL - (?s) | Causes dot to match any character |
Pattern.MULTILINE - (?m) | Expands where ^ and $ can match |
Pattern.COMMENTS - (?x) | Free-spacing and comment mode (Applies even inside character classes) |
Pattern.CASE_INSENSITIVE - (?i) | Case-insensitive matching for ASCII characters |
Pattern.UNICODE_CASE - (?u) | Case-insensitive matching for non-ASCII characters |
Pattern.CANON_EQ | Unicode "canonical equivalence" match mode (different encodings of the same character match as identical) |
Pattern.LITERAL | Treat the regex argument as plain, literal text instead of as a regular expression |
<br /><br />package sa.cdc.svn.service.repos;<br /><br />import java.io.BufferedReader;<br />import java.io.IOException;<br />import java.io.InputStream;<br />import java.io.InputStreamReader;<br />import java.util.regex.Matcher;<br />import java.util.regex.Pattern;<br /><br /><br />public class RegularExpression {<br /> /* Simple Regex Test */<br /> public void simpleRegexTest() {<br /> String regex = "\\d+\\w+";<br /> String input = "This is my 1st test string, soon will the 2nd come.";<br /><br /> // match like [groups]<br /> regex = "\\[([^\\[]*)\\]";<br /> input = "[groups][aliases][authzPath]";<br /><br /> // match number except 3,4,5<br /> regex = "[0-9&&[^345]]";<br /> input = "6";<br /><br /> regex = "a{3,6}";<br /> input = "aaaaaaaaa";<br /><br /> regex = "(dog){3}";<br /> input = "dogdogdogdogdog";<br /><br /> regex = "[abc]{3}";<br /> input = "abccabaaaccbbbc";<br /><br /> // Reluctant quanlifiers<br /> regex = ".*?foo";<br /> input = "xfooxxxxxxfoo";<br /><br /> // Refer to group index<br /> regex = "(\\d\\d)\\1";<br /> input = "1212";<br /><br /> // Start with dog<br /> regex = "^dog\\w*";<br /> input = "dogblahblah";<br /><br /> // A word boundary<br /> regex = "\\bdog\\b";<br /> input = "The dog plays in the yard.";<br /><br /> // A non-word boundary<br /> regex = "\\bdog\\B";<br /> input = "The doggie plays in the yard.";<br /><br /> // The end of the previous match<br /> regex = "\\Gdog";<br /> input = "dogdog dog";<br /><br /> // Need to set Pattern.CASE_INSENSITIVE;<br /> regex = "dog";<br /> input = "DoGDOg";<br /><br /> // (?i) means case insensitive<br /> regex = "(?i)dog";<br /> input = "DoGDOg";<br /><br /> regex = "foo";<br /> input = "fooooooooooooooooo";<br /><br /> regex = "a*b";<br /> input = "aabfooaabfooabfoob";<br /><br /> // match email address<br /> regex = "\\w+([-+.]\\w+)*@\\w+([-.]\\w+)*\\.\\w+([-.]\\w+)*";<br /> input = "as_bc@sie.com";<br /><br /> // match a url<br /> regex = 
"^[a-zA-z]+://(\\w+(-\\w+)*)(\\.(\\w+(-\\w+)*))*(\\?\\S*)?$";<br /> input = "http://abc.doe?";<br /><br /> // match a word with only digital and 26 letters<br /> regex = "^[A-Za-z0-9]+$"; // "^w+$"<br /> input = "123abc3sdf323";<br /><br /> // match a chinese id<br /> regex = "\\d{15}|\\d{18}";<br /> input = "44010484646354875834";<br /><br /> // match a chinese local phone<br /> regex = "\\d{3}-\\d{8}|\\d{4}-\\d{7}";<br /> input = "0319-8473645";<br /><br /> // match a chinese ip<br /> regex = "\\d+\\.\\d+\\.\\d+\\.\\d+";<br /> input = "61.144.43.235";<br /><br /> // match an integer<br /> regex = "^-?[1-9]\\d*|0$";<br /> input = "0";<br /><br /> // match an <a></a><br /> regex = "<(\\S*?)[^>]*>.*?</\1>|<.*?/>";<br /> input = "<abc>delphi<abc/>";<br /><br /> // match whitespace before or after a line<br /> regex = "^\\s*|\\s*$";<br /> input = "<abc>delphi<abc/> ";<br /><br /> // match a QQ number<br /> regex = "[1-9][0-9]{4,}";<br /> input = "8646354";<br /><br /> // match a date<br /> regex = "^(\\d{2}|\\d{4})-((0([1-9]{1}))|(1[1|2]))-(([0-2]([1-9]{1}))|(3[0|1]))$";<br /> input = "89-02-12";<br /><br /> // match chinese words<br /> regex = "[\u4e00-\u9fa5]";<br /> input = "志气";<br /><br /> // match unicode (two byte) character<br /> // String.prototype.len=function(){return this.replace([^x00-xff]/g,"aa").length;}<br /> regex = "[^\\x00-\\xff]";<br /> input = "志气";<br /><br /> // match empty line<br /> regex = "\\n\\s*\\r";<br /> input = "\n\r";<br /><br /> // match a float<br /> regex = "^(-?\\d+)(\\.\\d+)?$";<br /> input = "-123.23";<br /><br /> // match a date<br /> regex = "^(\\d{2}|\\d{4})-((0([1-9]{1}))|(1[1|2]))-(([0-2]([1-9]{1}))|(3[0|1]))$";<br /> input = "1989-02-12";<br /><br /> Pattern pattern = Pattern.compile(regex);<br /> Matcher matcher = pattern.matcher(input);<br /> boolean found = false;<br /> while (matcher.find()) {<br /> System.out.println("Found the text \"" + matcher.group() + "\", start at "<br /> + matcher.start() + ", end at " + 
matcher.end());<br /> found = true;<br /> }<br /> if (!found) {<br /> System.out.println("No match found.");<br /> }<br /> }<br /><br /> /* Parse A Structured File/Log */<br /> public void parseAuthzFile() {<br /> try {<br /> InputStream stream = getClass().getResourceAsStream("authz");<br /> BufferedReader reader = new BufferedReader(new InputStreamReader(stream));<br /><br /> StringBuilder authz = new StringBuilder();<br /> String line = null;<br /> while ((line = reader.readLine()) != null) {<br /> authz.append(line);<br /> authz.append('\n');<br /> }<br /><br /> // begins with [ and ends with ]<br /> String regex = "^\\[([^\\[]*)\\]$";<br /> String input = authz.toString();<br /><br /> Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE);<br /> Matcher matcher = pattern.matcher(input);<br /><br /> int location = 0;<br /> boolean found = false;<br /> // add global comments of the authz file<br /> if (matcher.find()) {<br /> System.out.println(authz.substring(location, matcher.start()));<br /> location = matcher.start();<br /> found = true;<br /> }<br /> // add each segment<br /> String segment = null;<br /> while (matcher.find()) {<br /> segment = authz.substring(location, matcher.start());<br /> location = matcher.start();<br /> System.out.print(segment);<br /> System.out.println("segment:" + matcher.group(1));<br /> }<br /> // then last segment<br /> if (found) {<br /> segment = authz.substring(location);<br /> System.out.print(segment);<br /> }<br /> } catch (IOException e) {<br /> e.printStackTrace();<br /> }<br /> }<br /><br /> public void splitInput() {<br /> Pattern pattern = Pattern.compile("\\d");<br /> String input = "one9two4three7four1five";<br /> String[] items = pattern.split(input);<br /> for (String item : items) {<br /> System.out.println(item);<br /> }<br /> }<br /><br /> public void identifyURL() {<br /> String url = "https://regex.info:8080/blog/article.do?id=123";<br /> String regex = "(?x) ^(https?):// ([^/:]+) (:(\\d+))? 
(.*)";<br /> Matcher m = Pattern.compile(regex).matcher(url);<br /><br /> if (m.matches()) {<br /> System.out.print("Overall [" + m.group() + "]" + " (from " + m.start() + " to "<br /> + m.end() + ")\n" + "Protocol [" + m.group(1) + "]" + " (from " + m.start(1)<br /> + " to " + m.end(1) + ")\n" + "Hostname [" + m.group(2) + "]" + " (from "<br /> + m.start(2) + " to " + m.end(2) + ")\n");<br /> // Group #3 might not have participated, so we must be careful here<br /> if (m.group(3) == null)<br /> System.out.println("No port; default of '80' is assumed");<br /> else {<br /> System.out.print("Port is [" + m.group(4) + "] " + "(from " + m.start(4) + " to "<br /> + m.end(4) + ")\n");<br /> }<br /> // Group #5 might also not have participated<br /> if (m.group(5) == null) {<br /> System.out.println("No path specified");<br /> } else {<br /> System.out.println("Path is [" + m.group(5) + "] " + "(from " + m.start(5) + " to "<br /> + m.end(5) + ")\n");<br /> }<br /> }<br /> }<br /><br /> public void searchAndReplace() {<br /> String regex = "\\bJava\\s*1\\.5\\b";<br /> String input = "Before Java 1.5 was Java 1.4.2. After Java 1.5 is Java 1.6";<br /> Matcher matcher = Pattern.compile(regex).matcher(input);<br /><br /> String result = matcher.replaceAll("Java 5.0");<br /> System.out.println("Replace all: " + result);<br /><br /> matcher.reset();<br /> result = matcher.replaceFirst("Java 5.0");<br /> System.out.println("Replace first: " + result);<br /><br /> matcher.reset();<br /> // You can convert "Java 1.6" to "Java 6.0" as well.<br /> result = Pattern.compile("\\bJava\\s*1\\.([56])\\b").matcher(input).replaceAll("Java $1.0");<br /> // $1\2 means the replace text will be followed by 2<br /> // $12 means the group(12) is the replacement text<br /> System.out.println("Argument replace: " + result);<br /><br /> matcher.reset();<br /> // Use wierd replacement text correctly<br /> result = matcher.replaceAll(Matcher.quoteReplacement("Java \\. 
$2 5.0"));<br /> System.out.println("Quote replacement: " + result);<br /><br /> matcher.reset();<br /> StringBuffer sb = new StringBuffer();<br /> while (matcher.find()) {<br /> matcher.appendReplacement(sb, "Java 5.0");<br /> System.out.println("Append replacement: " + sb.toString());<br /> }<br /> matcher.appendTail(sb);<br /> System.out.println("Append replacement: " + sb.toString());<br /><br /> // Convert Celsius temperatures to Fahrenheit<br /> input = "from 36.3C to 40.1C.";<br /> // ?: means non-capturing group, here the group count is actually 1<br /> matcher = Pattern.compile("(\\d+(?:\\.\\d*)?)C\\b").matcher(input);<br /> sb = new StringBuffer();<br /> while (matcher.find()) {<br /> float celsius = Float.parseFloat(matcher.group(1));<br /> int fahrenheit = (int) (celsius * 9 / 5 + 32);<br /> matcher.appendReplacement(sb, fahrenheit + "F");<br /> }<br /> matcher.appendTail(sb);<br /> System.out.println("Customized replacement: " + sb.toString());<br /><br /> // In-Place Replacement<br /> StringBuilder text = new StringBuilder("It's SO VERY RUDE to shout!");<br /> matcher = Pattern.compile("\\b[\\p{Lu}\\p{Lt}]+\\b").matcher(text);<br /> int matchPointer = 0;<br /> while (matcher.find(matchPointer)) {<br /> matchPointer = matcher.end();<br /> text.replace(matcher.start(), matcher.end(), "<b>" + matcher.group().toLowerCase()<br /> + "</b>");<br /> matchPointer += 7; // Account for having added '<b>' and '</b>'<br /> }<br /> System.out.println("In-place replacement1: " + text);<br /><br /> matcher.reset();<br /> sb = new StringBuffer();<br /> while (matcher.find()) {<br /> matcher.appendReplacement(sb, "<b>" + matcher.group().toLowerCase() + "</b>");<br /> }<br /> matcher.appendTail(sb);<br /> System.out.println("In-place replacement2: " + sb.toString());<br /><br /> // Transparent bounds<br /> regex = "\\bcar\\b";<br /> input = "Madagascar is best seen by car or bike.";<br /> matcher = Pattern.compile(regex).matcher(input);<br /> 
matcher.useAnchoringBounds(false);<br /> matcher.useTransparentBounds(true); // try to set false to see difference<br /> matcher.region(7, input.length());<br /> matcher.find();<br /> System.out.println("Matches starting at character " + matcher.start());<br /><br /> // The matcher's region<br /> // Matcher to find an image tag in html content<br /> String html = "a fragment of html text";<br /> // Matcher to find an image tag. The 'html' variable contains the HTML in question<br /> Matcher mImg = Pattern.compile("(?id)<IMG\\s+(.*?)/? />").matcher(html);<br /> // Matcher to find an ALT attribute (to be applied to an IMG tag's body within the same<br /> // 'html' variable)<br /> Matcher mAlt = Pattern.compile("(?ix)\\b ALT \\s* =").matcher(html);<br /> // Matcher to find a newline<br /> Matcher mLine = Pattern.compile("\\n").matcher(html);<br /><br /> // For each image tag within the html ...<br /> while (mImg.find()) {<br /> // Restrict the next ALT search to the body of the just-found image tag<br /> mAlt.region(mImg.start(1), mImg.end(1));<br /> // Report an error if no ALT found, showing the whole image tag found above<br /> if (!mAlt.find()) {<br /> // Restrict counting of newlines to the text before the start of the image tag<br /> mLine.region(0, mImg.start());<br /> int lineNum = 1; // The first line is numbered 1<br /> while (mLine.find())<br /> lineNum++; // Each newline bumps up the line number<br /> System.out.println("Missing ALT attribute on line " + lineNum);<br /> } else {<br /> System.out.println("Found ALT attribute, start at " + mAlt.start() + ", end at "<br /> + mAlt.end());<br /> }<br /> }<br /><br /> }<br /><br /> public static void main(String[] args) {<br /> RegularExpression regex = new RegularExpression();<br /> regex.simpleRegexTest();<br /> // regex.parseAuthzFile();<br /> // regex.splitInput();<br /> // regex.identifyURL();<br /> regex.searchAndReplace();<br /> }<br />}<br /><br />
Construct | Matches |
---|---|
Characters | |
x | The character x |
\\ | The backslash character |
\0n | The character with octal value 0n (0 <= n <= 7) |
\0nn | The character with octal value 0nn (0 <= n <= 7) |
\0mnn | The character with octal value 0mnn (0 <= m <= 3, 0 <= n <= 7) |
\xhh | The character with hexadecimal value 0xhh |
\uhhhh | The character with hexadecimal value 0xhhhh |
\t | The tab character ('\u0009') |
\n | The newline (line feed) character ('\u000A') |
\r | The carriage-return character ('\u000D') |
\f | The form-feed character ('\u000C') |
\a | The alert (bell) character ('\u0007') |
\e | The escape character ('\u001B') |
\cx | The control character corresponding to x |
Character classes | |
[abc] | a, b, or c (simple class) |
[^abc] | Any character except a, b, or c (negation) |
[a-zA-Z] | a through z or A through Z, inclusive (range) |
[a-d[m-p]] | a through d, or m through p: [a-dm-p] (union) |
[a-z&&[def]] | d, e, or f (intersection) |
[a-z&&[^bc]] | a through z, except for b and c: [ad-z] (subtraction) |
[a-z&&[^m-p]] | a through z, and not m through p: [a-lq-z] (subtraction) |
Predefined character classes | |
. | Any character (may or may not match line terminators) |
\d | A digit: [0-9] |
\D | A non-digit: [^0-9] |
\s | A whitespace character: [ \t\n\x0B\f\r] |
\S | A non-whitespace character: [^\s] |
\w | A word character: [a-zA-Z_0-9] |
\W | A non-word character: [^\w] |
POSIX character classes (US-ASCII only) | |
\p{Lower} | A lower-case alphabetic character: [a-z] |
\p{Upper} | An upper-case alphabetic character: [A-Z] |
\p{ASCII} | All ASCII: [\x00-\x7F] |
\p{Alpha} | An alphabetic character: [\p{Lower}\p{Upper}] |
\p{Digit} | A decimal digit: [0-9] |
\p{Alnum} | An alphanumeric character: [\p{Alpha}\p{Digit}] |
\p{Punct} | Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ |
\p{Graph} | A visible character: [\p{Alnum}\p{Punct}] |
\p{Print} | A printable character: [\p{Graph}\x20] |
\p{Blank} | A space or a tab: [ \t] |
\p{Cntrl} | A control character: [\x00-\x1F\x7F] |
\p{XDigit} | A hexadecimal digit: [0-9a-fA-F] |
\p{Space} | A whitespace character: [ \t\n\x0B\f\r] |
java.lang.Character classes (simple java character type) | |
\p{javaLowerCase} | Equivalent to java.lang.Character.isLowerCase() |
\p{javaUpperCase} | Equivalent to java.lang.Character.isUpperCase() |
\p{javaWhitespace} | Equivalent to java.lang.Character.isWhitespace() |
\p{javaMirrored} | Equivalent to java.lang.Character.isMirrored() |
Classes for Unicode blocks and categories | |
\p{InGreek} | A character in the Greek block (simple block) |
\p{Lu} | An uppercase letter (simple category) |
\p{Sc} | A currency symbol |
\P{InGreek} | Any character except one in the Greek block (negation) |
[\p{L}&&[^\p{Lu}]] | Any letter except an uppercase letter (subtraction) |
Boundary matchers | |
^ | The beginning of a line |
$ | The end of a line |
\b | A word boundary |
\B | A non-word boundary |
\A | The beginning of the input |
\G | The end of the previous match |
\Z | The end of the input but for the final terminator, if any |
\z | The end of the input |
Greedy quantifiers | |
X? | X, once or not at all |
X* | X, zero or more times |
X+ | X, one or more times |
X{n} | X, exactly n times |
X{n,} | X, at least n times |
X{n,m} | X, at least n but not more than m times |
Reluctant quantifiers | |
X?? | X, once or not at all |
X*? | X, zero or more times |
X+? | X, one or more times |
X{n}? | X, exactly n times |
X{n,}? | X, at least n times |
X{n,m}? | X, at least n but not more than m times |
Possessive quantifiers | |
X?+ | X, once or not at all |
X*+ | X, zero or more times |
X++ | X, one or more times |
X{n}+ | X, exactly n times |
X{n,}+ | X, at least n times |
X{n,m}+ | X, at least n but not more than m times |
Logical operators | |
XY | X followed by Y |
X|Y | Either X or Y |
(X) | X, as a capturing group |
Back references | |
\n | Whatever the nth capturing group matched |
Quotation | |
\ | Nothing, but quotes the following character |
\Q | Nothing, but quotes all characters until \E |
\E | Nothing, but ends quoting started by \Q |
Special constructs (non-capturing) | |
(?:X) | X, as a non-capturing group |
(?idmsux-idmsux) | Nothing, but turns match flags i d m s u x on - off |
(?idmsux-idmsux:X) | X, as a non-capturing group with the given flags i d m s u x on - off |
(?=X) | X, via zero-width positive lookahead |
(?!X) | X, via zero-width negative lookahead |
(?<=X) | X, via zero-width positive lookbehind |
(?<!X) | X, via zero-width negative lookbehind |
(?>X) | X, as an independent, non-capturing group |
Match and regex modes | |
Pattern.UNIX_LINES - (?d) | Changes how dot and ^ match |
Pattern.DOTALL - (?s) | Causes dot to match any character |
Pattern.MULTILINE - (?m) | Expands where ^ and $ can match |
Pattern.COMMENTS - (?x) | Free-spacing and comment mode (Applies even inside character classes) |
Pattern.CASE_INSENSITIVE - (?i) | Case-insensitive matching for ASCII characters |
Pattern.UNICODE_CASE - (?u) | Case-insensitive matching for non-ASCII characters |
Pattern.CANON_EQ | Unicode "canonical equivalence" match mode (different encodings of the same character match as identical) |
Pattern.LITERAL | Treat the regex argument as plain, literal text instead of as a regular expression |
package sa.cdc.svn.service.repos;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Runnable collection of {@code java.util.regex} examples: simple matching,
 * splitting a structured file into sections, splitting on a delimiter,
 * picking a URL apart with capturing groups, and several search-and-replace
 * techniques. Every example writes its result to standard output.
 */
public class RegularExpression {

    /**
     * Prints every match of {@code regex} found in {@code input}, together
     * with its start and end offsets, or a single "No match found." line when
     * the pattern does not occur at all.
     *
     * @param regex the regular expression to compile
     * @param input the text to scan
     */
    private static void printMatches(String regex, String input) {
        Matcher matcher = Pattern.compile(regex).matcher(input);
        boolean found = false;
        while (matcher.find()) {
            System.out.println("Found the text \"" + matcher.group() + "\", start at "
                    + matcher.start() + ", end at " + matcher.end());
            found = true;
        }
        if (!found) {
            System.out.println("No match found.");
        }
    }

    /**
     * Simple regex test.
     *
     * <p>The body is a catalogue of regex/input pairs. Each pair overwrites
     * the previous one, so only the LAST pair is actually executed; move the
     * pair you want to try to the end (or comment out the ones after it).
     */
    public void simpleRegexTest() {
        String regex = "\\d+\\w+";
        String input = "This is my 1st test string, soon will the 2nd come.";

        // match bracketed names like [groups]
        regex = "\\[([^\\[]*)\\]";
        input = "[groups][aliases][authzPath]";

        // match a digit except 3, 4, 5 (character-class intersection)
        regex = "[0-9&&[^345]]";
        input = "6";

        regex = "a{3,6}";
        input = "aaaaaaaaa";

        regex = "(dog){3}";
        input = "dogdogdogdogdog";

        regex = "[abc]{3}";
        input = "abccabaaaccbbbc";

        // Reluctant quantifiers
        regex = ".*?foo";
        input = "xfooxxxxxxfoo";

        // Refer to a group by index (back reference)
        regex = "(\\d\\d)\\1";
        input = "1212";

        // Starts with dog
        regex = "^dog\\w*";
        input = "dogblahblah";

        // A word boundary
        regex = "\\bdog\\b";
        input = "The dog plays in the yard.";

        // A non-word boundary
        regex = "\\bdog\\B";
        input = "The doggie plays in the yard.";

        // The end of the previous match
        regex = "\\Gdog";
        input = "dogdog dog";

        // Without Pattern.CASE_INSENSITIVE this does not match
        regex = "dog";
        input = "DoGDOg";

        // (?i) means case insensitive
        regex = "(?i)dog";
        input = "DoGDOg";

        regex = "foo";
        input = "fooooooooooooooooo";

        regex = "a*b";
        input = "aabfooaabfooabfoob";

        // match an email address
        regex = "\\w+([-+.]\\w+)*@\\w+([-.]\\w+)*\\.\\w+([-.]\\w+)*";
        input = "as_bc@sie.com";

        // match a url (scheme class is [a-zA-Z]; "A-z" would also admit [ \ ] ^ _ `)
        regex = "^[a-zA-Z]+://(\\w+(-\\w+)*)(\\.(\\w+(-\\w+)*))*(\\?\\S*)?$";
        input = "http://abc.doe?";

        // match a word made only of digits and the 26 ASCII letters
        regex = "^[A-Za-z0-9]+$";
        input = "123abc3sdf323";

        // match a Chinese ID number (18 digits tried first, otherwise the
        // 15-digit alternative would always win at the same position)
        regex = "\\d{18}|\\d{15}";
        input = "44010484646354875834";

        // match a Chinese local phone number
        regex = "\\d{3}-\\d{8}|\\d{4}-\\d{7}";
        input = "0319-8473645";

        // match a dotted-quad IP address (octet ranges are not validated)
        regex = "\\d+\\.\\d+\\.\\d+\\.\\d+";
        input = "61.144.43.235";

        // match an integer; the group makes ^ and $ apply to both alternatives
        regex = "^(-?[1-9]\\d*|0)$";
        input = "0";

        // match an HTML element; the back reference must be written "\\1",
        // a bare "\1" in a Java string literal is an octal character escape
        regex = "<(\\S*?)[^>]*>.*?</\\1>|<.*?/>";
        input = "<abc>delphi<abc/>";

        // match whitespace at the beginning or the end of a line
        regex = "^\\s*|\\s*$";
        input = "<abc>delphi<abc/> ";

        // match a QQ number
        regex = "[1-9][0-9]{4,}";
        input = "8646354";

        // match a date: yy-MM-dd or yyyy-MM-dd, months 01-12, days 01-31
        String dateRegex = "^(\\d{2}|\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12]\\d|3[01])$";
        regex = dateRegex;
        input = "89-02-12";

        // match Chinese characters (sample input is given as Unicode escapes
        // so the source does not depend on the compiler's file encoding)
        regex = "[\u4e00-\u9fa5]";
        input = "\u5fd7\u6c14";

        // match any character outside the Latin-1 range
        // JavaScript equivalent:
        // String.prototype.len=function(){return this.replace([^x00-xff]/g,"aa").length;}
        regex = "[^\\x00-\\xff]";
        input = "\u5fd7\u6c14";

        // match an empty line
        regex = "\\n\\s*\\r";
        input = "\n\r";

        // match a float
        regex = "^(-?\\d+)(\\.\\d+)?$";
        input = "-123.23";

        // match a date (four-digit year)
        regex = dateRegex;
        input = "1989-02-12";

        printMatches(regex, input);
    }

    /**
     * Parse a structured file/log.
     *
     * <p>Reads the classpath resource {@code authz} (resolved relative to this
     * class), treats every line of the form {@code [name]} as a section
     * header, and prints the text before the first header followed by each
     * section. Before every section except the first, a
     * {@code segment:<name>} marker line is printed. Nothing is printed when
     * the file contains no header at all. A missing resource is reported on
     * standard error instead of failing with a NullPointerException.
     */
    public void parseAuthzFile() {
        InputStream stream = getClass().getResourceAsStream("authz");
        if (stream == null) {
            System.err.println("Resource 'authz' not found on the classpath.");
            return;
        }
        try {
            StringBuilder authz = new StringBuilder();
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    authz.append(line).append('\n');
                }
            } finally {
                // Closing the reader also closes the underlying stream.
                reader.close();
            }

            // A header line begins with [ and ends with ]; MULTILINE lets
            // ^ and $ match at every line break, not just at the input ends.
            Pattern pattern = Pattern.compile("^\\[([^\\[]*)\\]$", Pattern.MULTILINE);
            Matcher matcher = pattern.matcher(authz.toString());

            int location = 0;
            boolean found = false;
            // Global comments: everything before the first header.
            if (matcher.find()) {
                System.out.println(authz.substring(location, matcher.start()));
                location = matcher.start();
                found = true;
            }
            // Each further header closes the section that started at 'location'.
            while (matcher.find()) {
                String segment = authz.substring(location, matcher.start());
                location = matcher.start();
                System.out.print(segment);
                System.out.println("segment:" + matcher.group(1));
            }
            // The last section runs to the end of the file.
            if (found) {
                System.out.print(authz.substring(location));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Splits a sample string on single digits and prints each piece on its
     * own line.
     */
    public void splitInput() {
        String input = "one9two4three7four1five";
        String[] items = Pattern.compile("\\d").split(input);
        for (String item : items) {
            System.out.println(item);
        }
    }

    /**
     * Breaks a sample URL into protocol, host name, optional port and path
     * using capturing groups, and prints each part with its offsets.
     */
    public void identifyURL() {
        String url = "https://regex.info:8080/blog/article.do?id=123";
        // (?x) enables comments mode, so the blanks in the pattern are ignored.
        // Groups: 1 protocol, 2 host, 3 ":port", 4 port digits, 5 remainder.
        String regex = "(?x) ^(https?):// ([^/:]+) (:(\\d+))? (.*)";
        Matcher m = Pattern.compile(regex).matcher(url);

        if (!m.matches()) {
            return;
        }
        System.out.print("Overall [" + m.group() + "]" + " (from " + m.start() + " to "
                + m.end() + ")\n" + "Protocol [" + m.group(1) + "]" + " (from " + m.start(1)
                + " to " + m.end(1) + ")\n" + "Hostname [" + m.group(2) + "]" + " (from "
                + m.start(2) + " to " + m.end(2) + ")\n");

        // Group #3 is optional and may not have participated in the match.
        if (m.group(3) == null) {
            int defaultPort = "https".equals(m.group(1)) ? 443 : 80;
            System.out.println("No port; default of '" + defaultPort + "' is assumed");
        } else {
            System.out.print("Port is [" + m.group(4) + "] " + "(from " + m.start(4) + " to "
                    + m.end(4) + ")\n");
        }

        // Group #5 always participates (it can match the empty string), so an
        // absent path shows up as an empty group, never as null.
        if (m.group(5).length() == 0) {
            System.out.println("No path specified");
        } else {
            System.out.println("Path is [" + m.group(5) + "] " + "(from " + m.start(5) + " to "
                    + m.end(5) + ")\n");
        }
    }

    /**
     * Demonstrates the replacement facilities of {@link Matcher}:
     * replaceAll/replaceFirst, group references in the replacement text,
     * quoting of replacement text, appendReplacement/appendTail, computed
     * replacements, in-place editing of a StringBuilder, transparent region
     * bounds, and restricting one matcher to the region found by another.
     */
    public void searchAndReplace() {
        String regex = "\\bJava\\s*1\\.5\\b";
        String input = "Before Java 1.5 was Java 1.4.2. After Java 1.5 is Java 1.6";
        Matcher matcher = Pattern.compile(regex).matcher(input);

        // replaceAll and replaceFirst reset the matcher themselves.
        String result = matcher.replaceAll("Java 5.0");
        System.out.println("Replace all: " + result);

        result = matcher.replaceFirst("Java 5.0");
        System.out.println("Replace first: " + result);

        // Capture the minor version so "Java 1.6" becomes "Java 6.0" as well.
        // $1\2 means the text of group 1 followed by a literal 2
        // $12 means the text of group 12
        result = Pattern.compile("\\bJava\\s*1\\.([56])\\b").matcher(input).replaceAll("Java $1.0");
        System.out.println("Argument replace: " + result);

        // quoteReplacement makes backslashes and dollar signs literal.
        result = matcher.replaceAll(Matcher.quoteReplacement("Java \\. $2 5.0"));
        System.out.println("Quote replacement: " + result);

        // Step-by-step replacement; the matcher must be reset after replaceAll.
        matcher.reset();
        StringBuffer sb = new StringBuffer();
        while (matcher.find()) {
            matcher.appendReplacement(sb, "Java 5.0");
            System.out.println("Append replacement: " + sb.toString());
        }
        matcher.appendTail(sb);
        System.out.println("Append replacement: " + sb.toString());

        // Convert Celsius temperatures to Fahrenheit.
        input = "from 36.3C to 40.1C.";
        // (?: ) is a non-capturing group, so the group count here is 1.
        matcher = Pattern.compile("(\\d+(?:\\.\\d*)?)C\\b").matcher(input);
        sb = new StringBuffer();
        while (matcher.find()) {
            float celsius = Float.parseFloat(matcher.group(1));
            int fahrenheit = (int) (celsius * 9 / 5 + 32);
            matcher.appendReplacement(sb, fahrenheit + "F");
        }
        matcher.appendTail(sb);
        System.out.println("Customized replacement: " + sb.toString());

        // In-place replacement: edit the StringBuilder the matcher is reading.
        String shouted = "It's SO VERY RUDE to shout!";
        Pattern upperWord = Pattern.compile("\\b[\\p{Lu}\\p{Lt}]+\\b");
        StringBuilder text = new StringBuilder(shouted);
        matcher = upperWord.matcher(text);
        int matchPointer = 0;
        // find(int) resets the matcher, so it sees the edited text each time.
        while (matcher.find(matchPointer)) {
            int start = matcher.start();
            int end = matcher.end();
            String replacement = "<b>" + matcher.group().toLowerCase() + "</b>";
            text.replace(start, end, replacement);
            // Resume right after the inserted text, tags included.
            matchPointer = start + replacement.length();
        }
        System.out.println("In-place replacement1: " + text);

        // Same result via appendReplacement, starting again from the
        // untouched sentence (the buffer above no longer has anything to match).
        matcher = upperWord.matcher(shouted);
        sb = new StringBuffer();
        while (matcher.find()) {
            String replacement = "<b>" + matcher.group().toLowerCase() + "</b>";
            matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(sb);
        System.out.println("In-place replacement2: " + sb.toString());

        // Transparent bounds: look-around and boundary checks may see text
        // outside the region, so the "car" inside "Madagascar" is not a word.
        regex = "\\bcar\\b";
        input = "Madagascar is best seen by car or bike.";
        matcher = Pattern.compile(regex).matcher(input);
        matcher.useAnchoringBounds(false);
        matcher.useTransparentBounds(true); // try to set false to see difference
        matcher.region(7, input.length());
        if (matcher.find()) {
            System.out.println("Matches starting at character " + matcher.start());
        } else {
            System.out.println("No match found.");
        }

        // The matcher's region: check every image tag for an ALT attribute.
        String html = "a fragment of html text";
        // Finds an image tag; group 1 is the tag body (its attributes).
        Matcher mImg = Pattern.compile("(?id)<IMG\\s+(.*?)/?>").matcher(html);
        // Finds an ALT attribute (applied to an image tag's body only).
        Matcher mAlt = Pattern.compile("(?ix)\\b ALT \\s* =").matcher(html);
        // Finds a newline, used to turn an offset into a line number.
        Matcher mLine = Pattern.compile("\\n").matcher(html);

        while (mImg.find()) {
            // Restrict the ALT search to the body of the just-found image tag.
            mAlt.region(mImg.start(1), mImg.end(1));
            if (!mAlt.find()) {
                // Count newlines in the text before the start of the image tag.
                mLine.region(0, mImg.start());
                int lineNum = 1; // The first line is numbered 1
                while (mLine.find()) {
                    lineNum++;
                }
                System.out.println("Missing ALT attribute on line " + lineNum);
            } else {
                System.out.println("Found ALT attribute, start at " + mAlt.start() + ", end at "
                        + mAlt.end());
            }
        }
    }

    /**
     * Runs the simple matching demo followed by the search-and-replace demo.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        RegularExpression regex = new RegularExpression();
        regex.simpleRegexTest();
        // regex.parseAuthzFile();
        // regex.splitInput();
        // regex.identifyURL();
        regex.searchAndReplace();
    }
}