Chapter 25 - Regular Expressions

Regular Expression Basics

Simple Quantifier

// The ? quantifier makes the preceding "u" optional (zero or one occurrence).
foreach (string candidate in new[] { "color", "colour", "colouur" })
  Regex.Match (candidate, @"colou?r").Success.Dump();

The Match object

// Expression snippet: evaluates to the Match object for the first hit.
new Regex (@"colou?r").Match ("any colour you like")

Using NextMatch

// Locate the first hit, then ask that Match for the one that follows it.
const string sentence = "One color? There are two colours in my head!";

Match first = Regex.Match (sentence, @"colou?rs?");
Match second = first.NextMatch();

first.Dump ("Match 1");
second.Dump ("Match 2");

The Matches Method

// Regex.Matches returns every hit in one collection; show each matched value in order.
MatchCollection hits = Regex.Matches ("One color? There are two colours in my head!", @"colou?rs?");

for (int i = 0; i < hits.Count; i++)
  hits[i].Value.Dump();

Alternators

// "Jen", optionally followed by either "ny" or "nifer".
string r = "Jen(ny|nifer)?";

foreach (string name in new[] { "Jenny", "Jennifer", "Jen", "Ben" })
  Regex.IsMatch (name, r).Dump();

Compiled Expressions

// One Regex instance, constructed with RegexOptions.Compiled, reused for both inputs.
var r = new Regex (@"sausages?", RegexOptions.Compiled);

foreach (string word in new[] { "sausage", "sausages" })
  r.IsMatch (word).Dump();

RegexOptions

// Case-insensitivity requested through an option flag...
Match viaOption = Regex.Match ("a", "A", RegexOptions.IgnoreCase);
viaOption.Value.Dump();

// ...or switched on inline with (?i)...
Match viaInline = Regex.Match ("a", @"(?i)A");
viaInline.Value.Dump();

// ...and switched off again part-way through the pattern with (?-i).
Match partlyInsensitive = Regex.Match ("AAAa", @"(?i)a(?-i)a");
partlyInsensitive.Value.Dump();

Character Escapes

// The Regex metacharacters are as follows:
//
//       \  *  +  ?  |  {  [  (  )  ^  $  .  #
//
// To match one of them literally, prefix it with a backslash.

Regex.Match ("what?", @"what\?").Value.Dump ("Correct");
Regex.Match ("what?", @"what?").Value.Dump ("Incorrect");

// Escape adds the backslashes for you; Unescape removes them.
Regex.Escape   (@"?").Dump ("Escape");
Regex.Unescape (@"\?").Dump ("Unescape");

// By default a space in a pattern is a literal character; under (?x) it is
// ignored, so the second pattern is effectively "helloworld".
Regex.IsMatch ("hello world", @"hello world").Dump ("Are spaces significant?");
Regex.IsMatch ("hello world", @"(?x) hello world").Dump ("Are spaces significant with (?x)?");

Character Sets

// A bracketed set matches any single character it lists.
int setHits = Regex.Matches ("That is that.", "[Tt]hat").Count;
setHits.Dump ("Matches any of a set");

// A leading ^ inside the brackets negates the set.
int firstIndex = Regex.Match ("quiz qwerty", "q[^aeiou]").Index;
firstIndex.Dump ("Matches any except those of a set");

// A hyphen between two characters denotes a range.
bool rangeMatched = Regex.Match ("b1-c4", @"[a-h]\d-[a-h]\d").Success;
rangeMatched.Dump ("Matches a range");

// \p{P} selects a Unicode category (punctuation).
bool hasPunctuation = Regex.IsMatch ("Yes, please", @"\p{P}");
hasPunctuation.Dump ("Matches character category");

Quantifiers

// * : zero or more of the preceding item.
Regex.IsMatch ("cv15.docx", @"cv\d*\.docx").Dump();
Regex.IsMatch ("cvjoint.docx", @"cv.*\.docx").Dump();

// + : one or more of the preceding item.
int slowCount = Regex.Matches ("slow! yeah slooow!", "slo+w").Count;
slowCount.Dump();

// {2,3} : between two and three of the preceding item.
var bp = new Regex (@"\d{2,3}/\d{2,3}");
foreach (string reading in new[] { "It used to be 160/110", "Now it's only 115/75" })
  bp.Match (reading).Value.Dump();

Greedy versus Lazy

string html = "<i>By default</i> quantifiers are <i>greedy</i> creatures";

// .* takes as much as it can before the closing tag.
MatchCollection greedyHits = Regex.Matches (html, @"<i>.*</i>");
for (int i = 0; i < greedyHits.Count; i++)
  greedyHits[i].Value.Dump ("Greedy");

// .*? takes as little as it can before the closing tag.
MatchCollection lazyHits = Regex.Matches (html, @"<i>.*?</i>");
for (int i = 0; i < lazyHits.Count; i++)
  lazyHits[i].Value.Dump ("Lazy");

Zero Width Assertions

Lookahead

// (?=miles) requires "miles" to follow but leaves it out of the match itself;
// the second pattern then goes on to consume it with .*
const string distance = "say 25 miles more";
Regex.Match (distance, @"\d+\s(?=miles)").Value.Dump();
Regex.Match (distance, @"\d+\s(?=miles).*").Value.Dump();

// A lookahead for a digit, followed by a six-character minimum.
foreach (string password in new[] { "blahblah3", "blahblaha" })
  Regex.IsMatch (password, @"(?=.*\d).{6,}").Dump ("Password is strong");

// (?!...) is the negative form: "good" must not be followed later by "however" or "but".
string regex = "(?i)good(?!.*(however|but))";
foreach (string remark in new[] { "Good work! But...", "Good work! Thanks!" })
  Regex.IsMatch (remark, regex).Dump ("Negative lookahead");

// Lookahead for an optional \r before the end of each line.
string fileNames = string.Join ("\r\n", new[] { "a.txt", "b.docx", "c.txt" });
string r = @".+\.txt(?=\r?$)";
MatchCollection textFiles = Regex.Matches (fileNames, r, RegexOptions.Multiline);
for (int i = 0; i < textFiles.Count; i++)
  Console.Write (textFiles[i].Value + " ");

Lookbehind

// (?<!...) is a negative lookbehind: "good" must not be preceded by "however".
string regex = "(?i)(?<!however.*)good";

foreach (string phrase in new[] { "However good, we...", "Very good, thanks!" })
  Regex.IsMatch (phrase, regex).Dump();

Anchors

// ^ pins the match to the start of the input.
Match atStart = new Regex ("^[Nn]o").Match ("Not now");
atStart.Value.Dump();

// $ pins the match to the end of the input.
Match atEnd = new Regex ("[Ff]$").Match ("f = 0.2F");
atEnd.Value.Dump();

Anchors - Handling End of Lines

// Three file names separated by Windows-style line breaks.
string fileNames = string.Join ("\r\n", new[] { "a.txt", "b.doc", "c.txt" });

// With Multiline, $ matches just before \n, so allow for an optional \r ahead of it.
string r = @".+\.txt(?=\r?$)";

MatchCollection textFiles = Regex.Matches (fileNames, r, RegexOptions.Multiline);
for (int i = 0; i < textFiles.Count; i++)
  Console.Write (textFiles[i].Value + " ");

Anchors - Empty Lines

string s = @"The
second to last line

has some

spaces
   
in it!";

// Lines with nothing at all between the line start and the line end.
int emptyCount = Regex.Matches (s, "^(?=\r?$)", RegexOptions.Multiline).Count;
emptyCount.Dump();

// Lines holding nothing but spaces or tabs (empty lines included).
int blankCount = Regex.Matches (s, "^[ \t]*(?=\r?$)", RegexOptions.Multiline).Count;
blankCount.Dump();

Word Boundaries

const string title = "Wedding in Sarajevo";

// \b\w+\b picks out each whole word.
MatchCollection words = Regex.Matches (title, @"\b\w+\b");
for (int i = 0; i < words.Count; i++)
  words[i].Value.Dump();

// Compare counting "in" as a whole word against counting it anywhere.
int wholeWordCount = Regex.Matches (title, @"\bin\b").Count;
wholeWordCount.Dump ("With the word boundary operator");

int anywhereCount = Regex.Matches (title, @"in").Count;
anywhereCount.Dump ("Without the word boundary operator");

// The word (plus its trailing space) that comes immediately before "(sic)".
string text = "Don't loose (sic) your cool";
Match beforeSic = Regex.Match (text, @"\b\w+\b\s(?=\(sic\))");
beforeSic.Value.Dump();

Groups

// Group 0 is the whole match; groups 1 and 2 are the parenthesised parts.
Match m = Regex.Match ("206-465-1918", @"(\d{3})-(\d{3}-\d{4})");

for (int groupIndex = 0; groupIndex <= 2; groupIndex++)
  m.Groups[groupIndex].Value.Dump();

Console.WriteLine();

// \1 refers back to whatever group 1 captured: words that start and end with the same letter.
MatchCollection sameEnds = Regex.Matches ("pop pope peep", @"\b(\w)\w+\1\b");
for (int i = 0; i < sameEnds.Count; i++)
  Console.Write (sameEnds[i].Value + " ");

Named Groups

// Pattern assembled piece by piece so each part can carry its own note.
string regEx =
  @"\b"             +  // start on a word boundary
  @"(?'letter'\w)"  +  // capture the first character as 'letter'
  @"\w+"            +  // one or more characters in between
  @"\k'letter'"     +  // the final character must equal 'letter'
  @"\b";               // finish on a word boundary

MatchCollection found = Regex.Matches ("bob pope peep", regEx);
for (int i = 0; i < found.Count; i++)
  Console.Write (found[i].Value + " ");

Named Groups - XML tag

// Matches one element and captures its tag name and inner text.
// The opening tag uses [^>]* rather than .* so it stops at its own closing '>'.
// A greedy .* runs to the last '>' on the line and, when the same tag appears
// twice (e.g. "<b>x</b><b>y</b>"), captures the wrong text.
string regFind =
  @"<(?'tag'\w+)\b[^>]*>" +  // match the opening tag, and name it 'tag'
  @"(?'text'.*?)"         +  // match text content, name it 'text'
  @"</\k'tag'>";             // match the closing tag, denoted by 'tag'

Match m = Regex.Match ("<h1>hello</h1>", regFind);
m.Groups ["tag"].Value.Dump();
m.Groups ["text"].Value.Dump();

Replacing and Splitting Text

Simple Replacement

// Replace "cat" only where it stands as a whole word.
string find = @"\bcat\b";
string replace = "dog";

string swapped = new Regex (find).Replace ("catapult the cat", replace);
swapped.Dump();

Referencing the Original String

// $0 in the replacement stands for the text that was matched.
string text = "10 plus 20 makes 30";

string bracketed = new Regex (@"\d+").Replace (text, @"<$0>");
bracketed.Dump();

Updating an XML tag

// Matches one element and captures its tag name and inner text.
// The opening tag uses [^>]* rather than .* so it stops at its own closing '>'.
// A greedy .* runs to the last '>' on the line and, when the same tag appears
// twice (e.g. "<msg>a</msg><msg>b</msg>"), swallows the first element.
string regFind =
  @"<(?'tag'\w+)\b[^>]*>" +  // match the opening tag, and name it 'tag'
  @"(?'text'.*?)"         +  // match text content, name it 'text'
  @"</\k'tag'>";             // match the closing tag, denoted by 'tag'

// Rewrites <tag>text</tag> as <tag value="text"/>.
string regReplace =
  @"<${tag}"         +  // <tag
  @" value="""       +  // value="
  @"${text}"         +  // text
  @"""/>";              // "/>

Regex.Replace ("<msg>hello</msg>", regFind, regReplace).Dump();

Using MatchEvaluator

// Expression snippet: each run of digits is replaced by ten times its value.
new Regex (@"\d+").Replace (
  "5 is less than 10",
  m => (int.Parse (m.Value) * 10).ToString()
)

Splitting Text

// Split wherever a digit occurs; the digits themselves are dropped.
string[] letters = Regex.Split ("a5b7c", @"\d");
for (int i = 0; i < letters.Length; i++)
  Console.Write (letters[i] + " ");

Console.WriteLine();

// Split at the zero-width position before each capital letter; nothing is dropped.
string[] words = Regex.Split ("oneTwoThree", @"(?=[A-Z])");
for (int i = 0; i < words.Length; i++)
  Console.Write (words[i] + " ");

Regex Cookbook

Matching a US Phone or Social Security Number

// Social security number: three digits, two digits, four digits, hyphen-separated.
string ssNum = @"\d{3}-\d{2}-\d{4}";

bool ssMatched = Regex.IsMatch ("123-45-6789", ssNum);
Console.WriteLine (ssMatched);      // True

// Phone number: the area code is either "ddd" plus a hyphen/space, or "(ddd)"
// plus an optional space. (?x) lets the pattern be laid out over several lines.
string phone = @"(?x)
  ( \d{3}[-\s] | \(\d{3}\)\s? )
    \d{3}[-\s]?
    \d{4}";

foreach (string number in new[] { "123-456-7890", "(123) 456-7890" })
  Console.WriteLine (Regex.IsMatch (number, phone));   // True

Extracting Name=Value pairs

// One "name = value" pair per line.
// The value is captured lazily (.*?) so that the \s* after it really does trim
// trailing whitespace. A greedy .* would swallow trailing spaces and, in
// CRLF text, the \r as well, leaving them inside the captured value.
string r = @"(?m)^\s*(?'name'\w+)\s*=\s*(?'value'.*?)\s*(?=\r?$)";

string text =
  @"id = 3
    secure = true
    timeout = 30";

foreach (Match m in Regex.Matches (text, r))
  Console.WriteLine (m.Groups["name"] + " is " + m.Groups["value"]);

Strong Password Validation

string r =
  @"(?x)" +                           // Whitespace inside the pattern is ignored, for readability
  @"^"    +                           // Start of string
  @"(?=.* ( \d | \p{P} | \p{S} ))" +  // Somewhere ahead there is a digit, punctuation char or symbol
  @".{6,}";                           // At least six characters in total

foreach (string candidate in new[] { "abc12", "abcdef", "ab88yz" })
{
  bool accepted = Regex.IsMatch (candidate, r);
  Console.WriteLine (accepted);
}

Lines at least n characters

// Lines of at least 80 characters.
// [^\r\n] is used instead of '.' because '.' also matches \r. With '.', a
// 79-character line ending in \r\n would be counted as 80 characters long.
string r = @"(?m)^[^\r\n]{80,}(?=\r?$)";

string fifty = new string ('x', 50);
string eighty = new string ('x', 80);

string text = eighty + "\r\n" + fifty + "\r\n" + eighty;

Console.WriteLine (Regex.Matches (text, r).Count);

Parsing Dates and Times

// Three numeric date parts separated by '.', '/' or '-', then h:m:s and an
// optional AM/PM marker. (?x) permits the layout; (?i) ignores case.
string r = @"(?x)(?i)
 (\d{1,4}) [./-]
 (\d{1,2}) [./-]
 (\d{1,4}) [\sT]  (\d+):(\d+):(\d+) \s? (A\.?M\.?|P\.?M\.?)?";

string text = "01/02/2008 5:20:50 PM";

// Group 0 is the whole match; the rest are the captured parts in order.
GroupCollection parts = Regex.Match (text, r).Groups;
for (int i = 0; i < parts.Count; i++)
  Console.WriteLine (parts[i].Value + " ");

Matching Roman Numerals

// Roman numeral as a whole word: thousands, hundreds, tens, units.
// Every part below is optional, so without the (?=[mdclxvi]) lookahead the
// pattern would also match the empty string at any word boundary, and IsMatch
// would return true for almost any text. The lookahead demands at least one
// numeral letter at the start.
string r =
  @"(?i)\b(?=[mdclxvi])m*" +  // thousands
  @"(d?c{0,3}|c[dm])"      +  // hundreds
  @"(l?x{0,3}|x[lc])"      +  // tens
  @"(v?i{0,3}|i[vx])"      +  // units
  @"\b";

Console.WriteLine (Regex.IsMatch ("MCMLXXXIV", r));

Removing Repeated Words

// A word, one non-word character, then the same word again.
// The \b anchors on both ends make sure whole words are compared. Without
// them "this is" matches as "is" + " " + "is" and is wrongly collapsed to "this".
string r = @"\b(?'dupe'\w+)\W\k'dupe'\b";

string text = "In the the beginning...";
Console.WriteLine (Regex.Replace (text, r, "${dupe}"));

Replacing newline with return-newline

// Turns each bare \n into \r\n; the negative lookbehind skips any \n that
// already has a \r in front of it.

string n = "\n";
string rn = "\r\n";
string text = string.Concat ("L1", n, "L2", rn, "L3");

var bareNewline = new Regex ("(?<!\r)\n");
string result = bareNewline.Replace (text, "\r\n");

// Show each character next to its numeric code so the line breaks are visible.
(from c in result select new { c, Code = (int) c }).Dump();

Word Count

// A word is a run of word characters, hyphens or apostrophes between word boundaries.
string r = @"\b(\w|[-'])+\b";

string text = "It's all mumbo-jumbo to me";

int wordCount = Regex.Matches (text, r).Count;
Console.WriteLine (wordCount);

Matching a GUID

// Hex digits in groups of 8-4-4-4-12, separated by hyphens, between word boundaries.
string r = string.Concat (
  @"(?i)\b",
  @"[0-9a-fA-F]{8}\-",
  @"[0-9a-fA-F]{4}\-",
  @"[0-9a-fA-F]{4}\-",
  @"[0-9a-fA-F]{4}\-",
  @"[0-9a-fA-F]{12}",
  @"\b");

string text = "Its key is {3F2504E0-4F89-11D3-9A0C-0305E82C3301}.";

// Print where in the text the GUID starts.
Match guid = Regex.Match (text, r);
Console.WriteLine (guid.Index);

Parsing an XML tag

// Matches one element and captures its tag name and inner text.
// The opening tag uses [^>]* rather than .* so it stops at its own closing '>'.
// A greedy .* runs to the last '>' on the line and, when the same tag appears
// twice (e.g. "<b>x</b><b>y</b>"), captures "y" instead of "x".
string r =
  @"<(?'tag'\w+)\b[^>]*>" +  // match the opening tag, and name it 'tag'
  @"(?'text'.*?)" +          // match text content, name it 'text'
  @"</\k'tag'>";             // match the closing tag, denoted by 'tag'

string text = "<h1>hello</h1>";

Match m = Regex.Match (text, r);

Console.WriteLine (m.Groups ["tag"].Value);
Console.WriteLine (m.Groups ["text"].Value);

Splitting a Camel-Cased Word

// Zero-width split point before every capital letter.
string r = @"(?=[A-Z])";

string[] words = Regex.Split ("oneTwoThree", r);
for (int i = 0; i < words.Length; i++)
  Console.Write (words[i] + " ");

Obtaining a Legal Filename

string input = "My \"good\" <recipes>.txt";

// Build a character set from whatever the current platform reports as
// invalid in a file name, escaping any that are regex metacharacters.
char[] invalidChars = System.IO.Path.GetInvalidFileNameChars();
string invalidString = Regex.Escape (new string (invalidChars));
var stripper = new Regex (string.Concat ("[", invalidString, "]"));

// Delete every character that falls in that set.
string valid = stripper.Replace (input, string.Empty);
Console.WriteLine (valid);

Escaping Unicode Characters for HTML

string htmlFragment = "© 2007";

// Try a surrogate pair first so that a character outside the Basic
// Multilingual Plane becomes ONE numeric entity. Encoding the two surrogates
// separately produces invalid HTML. Otherwise take any single UTF-16 unit
// from U+0080 upwards.
string pattern = @"[\uD800-\uDBFF][\uDC00-\uDFFF]|[\u0080-\uFFFF]";

// A two-unit match is a surrogate pair: combine it into its code point.
// A one-unit match is emitted using its own numeric value.
MatchEvaluator toEntity = m =>
{
  int codePoint = m.Value.Length == 2
    ? char.ConvertToUtf32 (m.Value[0], m.Value[1])
    : m.Value[0];
  return "&#" + codePoint + ";";
};

string result = Regex.Replace (htmlFragment, pattern, toEntity);

Console.WriteLine (result);

Unescaping Characters in an HTTP Query String

string sample = "C%23 in a Nutshell";

// Each %xx escape (hex digits in either case) is replaced by the character
// whose code is xx.
var percentEscape = new Regex (@"%[0-9a-f][0-9a-f]", RegexOptions.IgnoreCase);

string result = percentEscape.Replace (sample, m =>
{
  string hexDigits = m.Value.Substring (1);        // drop the leading '%'
  byte code = Convert.ToByte (hexDigits, 16);
  return Convert.ToChar (code).ToString();
});

Console.WriteLine (result);

Parsing Google Search Terms from a Web Stats Log

string sample = "http://www.google.com/search?hl=en&q=greedy+quantifiers+regex&btnG=Search";

// The lookbehind requires "q=" to be a whole parameter name: either directly
// after "search?" or after an "&". An unanchored "q=" would also match the
// tail of other parameters such as "oq=" or "aq=".
// [^&]+ then takes the value up to the next "&" or the end of the string, and
// yields no match (rather than the following parameter) when q is empty.
Match m = Regex.Match (sample, @"(?<=google\..+search\?(?:.*&)?q=)[^&]+");

// The individual search terms are separated by '+'.
string[] keywords = m.Value.Split (new[] { '+' }, StringSplitOptions.RemoveEmptyEntries);
keywords.Dump();

// Note: this may need to be used in conjunction with the previous
// example, i.e. "Unescaping Characters in an HTTP Query String".