5d92f7fa38e405cd198558cc7330cfd4c57bb4ac
3 /* private */ var $mText, # Text to be processed by the tokenizer
4 $mPos, # Current position of the tokenizer in $mText (used as a character index)
5 $mTextLength, # Length of $mText
6 $mQueuedToken; # Tokens that were already found but have not been returned yet
// Presumably the PHP 4-style constructor (the class header is not visible here);
// the only visible statement resets a token queue to an empty array.
// NOTE(review): this assigns $this->mTokenQueue, but the declared member and every
// other visible use is $this->mQueuedToken - looks like a misnamed property, which
// would leave $mQueuedToken uninitialised; confirm.
9 /* private */ function Tokenizer()
12 $this->mTokenQueue
=array();
// Factory taking the text $s to tokenize. Only part of the body is visible here:
// it records strlen( $s ) (a byte count in PHP) as the mTextLength of $t.
// The creation of $t, the remaining assignments and the return are not visible.
16 function newFromString( $s )
20 $t->mTextLength
= strlen( $s );
// Debug output, left disabled.
21 // echo "New tokenizer generated. <pre>{$s}</pre>\n";
26 // Peek at the next token without consuming it: the next call
27 // to previewToken or nextToken will return the same token again.
28 // Internally the text pointer is advanced, but the token is queued. The next
29 // call to previewToken or nextToken will check the queue and return
// the queued token first.
31 function previewToken()
33 if ( count( $this->mQueuedToken
) != 0 ) {
34 // A token is still queued from an earlier call: read it, but leave it in the queue.
35 $token = $this->mQueuedToken
[0];
// (The line opening the alternative branch is not visible here.) Nothing queued:
// fetch a token via nextToken() and put it at the front of the queue so that the
// following call sees it again.
37 $token = $this->nextToken();
38 array_unshift( $this->mQueuedToken
, $token );
45 // proceeds character by character through the text, looking for characters needing
46 // special attention. Those are currently: I, R, ', [, ], newline
48 // TODO: prefixed links for Arabic wikipedia not implemented yet
49 // handling of French blanks not yet implemented
// NOTE(review): the function header is not visible in this excerpt; previewToken()
// calls $this->nextToken(), so this is presumably the body of nextToken() - confirm.
// Visible scheme: when a marker is recognised, a marker token ($queueToken) is
// appended to $this->mQueuedToken and scanning stops with "break 2".
// NOTE(review): the look-ahead reads in the R, I, [, ] and ' cases index
// $this->mText at mPos+1 .. mPos+4 without an isset() guard (unlike the "\n" case),
// so near the end of the text they read past the last character - confirm intended.
52 if ( count( $this->mQueuedToken
) != 0 ) {
53 // still one token from the last round around. Return that one first.
// array_shift() removes the token from the front of the queue (previewToken() only reads it).
54 $token = array_shift( $this->mQueuedToken
);
// Queue empty (the branch opener is not visible): start a token of type "text".
58 $token["type"]="text";
60 // If no text is left, return "false".
61 if ( $this->mPos
> $this->mTextLength
)
// The loop deliberately includes mPos == mTextLength; for that index the isset()
// guard in the switch expression yields '' instead of reading past the string.
64 while ( $this->mPos
<= $this->mTextLength
) {
65 switch ( $ch = isset($this->mText
[$this->mPos
]) ?
$this->mText
[$this->mPos
] : '' ) {
// NOTE(review): offset +3 is never tested and the blank is tested at +4, although
// in "RFC " the blank is the 4th character (offset +3). Compare the ISBN case,
// which tests +1 .. +4 for its 5-character marker. Looks like an off-by-one
// (should probably be +3) - confirm before changing.
66 case 'R': // for "RFC "
67 if ( $this->mText
[$this->mPos+
1] == 'F' &&
68 $this->mText
[$this->mPos+
2] == 'C' &&
69 $this->mText
[$this->mPos+
4] == ' ' ) {
// For this marker the token's type and text are the same string.
70 $queueToken["type"] = $queueToken["text"] = "RFC ";
71 $this->mQueuedToken
[] = $queueToken;
73 break 2; // switch + while
76 case 'I': // for "ISBN "
77 if ( $this->mText
[$this->mPos+
1] == 'S' &&
78 $this->mText
[$this->mPos+
2] == 'B' &&
79 $this->mText
[$this->mPos+
3] == 'N' &&
80 $this->mText
[$this->mPos+
4] == ' ' ) {
81 $queueToken["type"] = $queueToken["text"] = "ISBN ";
82 $this->mQueuedToken
[] = $queueToken;
84 break 2; // switch + while
// Three opening brackets are tested before two, so "[[[" is not reported as "[[".
87 case "[": // for links "[["
88 if ( $this->mText
[$this->mPos+
1] == "[" &&
89 $this->mText
[$this->mPos+
2] == "[" ) {
90 $queueToken["type"] = "[[[";
91 $queueToken["text"] = "";
92 $this->mQueuedToken
[] = $queueToken;
94 break 2; // switch + while
95 } else if ( $this->mText
[$this->mPos+
1] == "[" ) {
96 $queueToken["type"] = "[[";
97 $queueToken["text"] = "";
98 $this->mQueuedToken
[] = $queueToken;
100 break 2; // switch + while
103 case "]": // for end of links "]]"
104 if ( $this->mText
[$this->mPos+
1] == "]" ) {
105 $queueToken["type"] = "]]";
106 $queueToken["text"] = "";
107 $this->mQueuedToken
[] = $queueToken;
109 break 2; // switch + while
// A run of at least two apostrophes becomes one token whose type is the run itself:
// the type starts as "'" and one more "'" is appended per loop iteration.
112 case "'": // for all kind of em's and strong's
113 if ( $this->mText
[$this->mPos+
1] == "'" ) {
114 $queueToken["type"] = "'";
115 $queueToken["text"] = "";
// The rest of this loop body is not visible here; presumably it also advances mPos - TODO confirm.
116 while(isset($this->mText
[$this->mPos+
1]) && $this->mText
[$this->mPos+
1] == "'" ) {
117 $queueToken["type"] .= "'";
121 $this->mQueuedToken
[] = $queueToken;
123 break 2; // switch + while
126 case "\n": // for block levels, actually, only "----" is handled.
// Checking isset() on offset +4 first ensures all four look-ahead characters exist.
128 if ( isset($this->mText
[$this->mPos+
4]) &&
129 $this->mText
[$this->mPos+
1] == "-" &&
130 $this->mText
[$this->mPos+
2] == "-" &&
131 $this->mText
[$this->mPos+
3] == "-" &&
132 $this->mText
[$this->mPos+
4] == "-" ) {
133 $queueToken["type"] = "----";
134 $queueToken["text"] = "";
135 $this->mQueuedToken
[] = $queueToken;
// Loop over consecutive "-" characters at the current position; the loop body is not
// visible here (presumably it advances mPos past the dashes - TODO confirm).
// NOTE(review): this read has no isset() guard, so a rule at the very end of the text
// reads one character past the end.
137 while ($this->mText
[$this->mPos
] == "-" ) {
145 // echo $this->mPos . "<br>\n";
147 } /* if (nothing left in queue) */