NFA.java
6.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/******************************************************************************
* Compilation: javac NFA.java
* Execution: java NFA regexp text
* Dependencies: Stack.java Bag.java Digraph.java DirectedDFS.java
*
* % java NFA "(A*B|AC)D" AAAABD
* true
*
* % java NFA "(A*B|AC)D" AAAAC
* false
*
* % java NFA "(a|(bc)*d)*" abcbcd
* true
*
* % java NFA "(a|(bc)*d)*" abcbcbcdaaaabcbcdaaaddd
* true
*
* Remarks
* -----------
* The following features are not supported:
* - The + operator
* - Multiway or
* - Metacharacters in the text
* - Character classes.
*
******************************************************************************/
package edu.princeton.cs.algs4;
/**
* The {@code NFA} class provides a data type for creating a
* <em>nondeterministic finite state automaton</em> (NFA) from a regular
* expression and testing whether a given string is matched by that regular
* expression.
* It supports the following operations: <em>concatenation</em>,
* <em>closure</em>, <em>binary or</em>, and <em>parentheses</em>.
* It does not support <em>multiway or</em>, <em>character classes</em>,
* <em>metacharacters</em> (either in the text or pattern),
* <em>capturing capabilities</em>, <em>greedy</em> or <em>reluctant</em>
* modifiers, and other features in industrial-strength implementations
* such as {@link java.util.regex.Pattern} and {@link java.util.regex.Matcher}.
* <p>
* This implementation builds the NFA using a digraph and a stack
* and simulates the NFA using digraph search (see the textbook for details).
* The constructor takes time proportional to <em>m</em>, where <em>m</em>
* is the number of characters in the regular expression.
* The <em>recognizes</em> method takes time proportional to <em>m n</em>,
* where <em>n</em> is the number of characters in the text.
* <p>
* For additional documentation,
* see <a href="http://algs4.cs.princeton.edu/54regexp">Section 5.4</a> of
* <i>Algorithms, 4th Edition</i> by Robert Sedgewick and Kevin Wayne.
*
* @author Robert Sedgewick
* @author Kevin Wayne
*/
public class NFA {

    private final Digraph graph;     // digraph of epsilon transitions
    private final String regexp;     // regular expression
    private final int m;             // number of characters in regular expression

    /**
     * Initializes the NFA from the specified regular expression.
     * <p>
     * State {@code i} corresponds to character {@code i} of the regular
     * expression and state {@code m} is the accept state. Only the
     * epsilon transitions are stored in the digraph; match transitions are
     * derived from the regular expression itself during simulation.
     *
     * @param  regexp the regular expression
     * @throws IllegalArgumentException if the parentheses in {@code regexp}
     *         are unbalanced, if a {@code |} is not enclosed directly in a
     *         pair of parentheses, or if a parenthesized group contains more
     *         than one {@code |} at its top level (multiway or)
     */
    public NFA(String regexp) {
        this.regexp = regexp;
        m = regexp.length();
        graph = new Digraph(m+1);

        // indices of the '(' and '|' characters that are still open
        Stack<Integer> ops = new Stack<Integer>();

        for (int i = 0; i < m; i++) {
            char c = regexp.charAt(i);

            // lp is the index where the operand ending at i begins: i itself
            // for a single character, or the matching '(' for a group
            int lp = i;

            if (c == '(' || c == '|') {
                ops.push(i);
            }
            else if (c == ')') {
                if (ops.size() == 0)
                    throw new IllegalArgumentException(
                        "Invalid regular expression: unmatched ')' at index " + i);
                int or = ops.pop();

                // 2-way or operator
                if (regexp.charAt(or) == '|') {
                    if (ops.size() == 0)
                        throw new IllegalArgumentException(
                            "Invalid regular expression: '|' at index " + or + " is not enclosed in parentheses");
                    lp = ops.pop();
                    if (regexp.charAt(lp) != '(')
                        throw new IllegalArgumentException(
                            "Invalid regular expression: multiway or is not supported");
                    graph.addEdge(lp, or+1);   // skip the first alternative
                    graph.addEdge(or, i);      // leave the first alternative
                }
                else {
                    // only '(' and '|' are ever pushed, so this is the matching '('
                    lp = or;
                }
            }

            // closure operator (uses 1-character lookahead)
            if (i < m-1 && regexp.charAt(i+1) == '*') {
                graph.addEdge(lp, i+1);
                graph.addEdge(i+1, lp);
            }

            // metacharacters that consume no text fall through to the next state
            if (c == '(' || c == '*' || c == ')')
                graph.addEdge(i, i+1);
        }

        // anything left on the stack is a '(' or '|' that was never closed
        if (ops.size() != 0)
            throw new IllegalArgumentException("Invalid regular expression");
    }

    /**
     * Returns true if the text is matched by the regular expression.
     * A {@code .} in the regular expression matches any single text character.
     *
     * @param  txt the text
     * @return {@code true} if the text is matched by the regular expression,
     *         {@code false} otherwise
     * @throws IllegalArgumentException if one of the metacharacters
     *         {@code *}, {@code |}, {@code (} or {@code )} is encountered
     *         while scanning {@code txt}
     */
    public boolean recognizes(String txt) {
        // states reachable from the start state before reading any text
        Bag<Integer> pc = markedStates(new DirectedDFS(graph, 0));

        // Compute possible NFA states for txt[i+1]
        for (int i = 0; i < txt.length(); i++) {
            char c = txt.charAt(i);
            if (c == '*' || c == '|' || c == '(' || c == ')')
                throw new IllegalArgumentException("text contains the metacharacter '" + c + "'");

            // states entered by consuming txt[i]
            Bag<Integer> match = new Bag<Integer>();
            for (int v : pc) {
                if (v == m) continue;   // the accept state has no character to match
                if (regexp.charAt(v) == c || regexp.charAt(v) == '.')
                    match.add(v+1);
            }

            // no state can consume txt[i], so nothing is reachable afterwards;
            // reject without running a search from an empty set of sources
            if (match.size() == 0) return false;

            pc = markedStates(new DirectedDFS(graph, match));
        }

        // check for accept state
        for (int v : pc)
            if (v == m) return true;
        return false;
    }

    // collects every state of the NFA that the given search has marked
    private Bag<Integer> markedStates(DirectedDFS dfs) {
        Bag<Integer> states = new Bag<Integer>();
        for (int v = 0; v < graph.V(); v++)
            if (dfs.marked(v)) states.add(v);
        return states;
    }

    /**
     * Unit tests the {@code NFA} data type.
     * Wraps the first argument in parentheses, builds the NFA for it and
     * prints whether the NFA recognizes the second argument.
     *
     * @param args the command-line arguments: the regular expression
     *        followed by the text
     */
    public static void main(String[] args) {
        String regexp = "(" + args[0] + ")";
        String txt = args[1];
        NFA nfa = new NFA(regexp);
        StdOut.println(nfa.recognizes(txt));
    }
}
/******************************************************************************
* Copyright 2002-2016, Robert Sedgewick and Kevin Wayne.
*
* This file is part of algs4.jar, which accompanies the textbook
*
* Algorithms, 4th edition by Robert Sedgewick and Kevin Wayne,
* Addison-Wesley Professional, 2011, ISBN 0-321-57351-X.
* http://algs4.cs.princeton.edu
*
*
* algs4.jar is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* algs4.jar is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with algs4.jar. If not, see http://www.gnu.org/licenses.
******************************************************************************/