is_file = FALSE;
        if (!empty($filename)) {
            $this->file = $filename;
            $this->sep = self::get_separator($filename);
            $file_handle = @fopen($filename, 'r');
            if ($file_handle) {
                $this->is_file = TRUE;
                $this->file_handle = $file_handle;
                $this->nc = fgetc($this->file_handle);
                // skip junk at start
                for ($i = 0; $i < strlen($skip); $i++) {
                    if ($this->nc !== $skip[$i])
                        break;
                    $this->nc = fgetc($this->file_handle);
                }
                $this->eof = ($this->nc === FALSE);
            }
        }
    }

    /**
     * Close the underlying file handle, if one is currently open.
     *
     * Safe to call when the file could not be opened or has already been
     * closed: fclose() is only invoked on a live stream resource.
     */
    public function closeFilePointer() {
        if (isset($this->file_handle) && is_resource($this->file_handle)) {
            fclose($this->file_handle);
        }
    }

    /**
     * Read every remaining row and return the data rows keyed by column name.
     *
     * The first row read supplies the column names. Every following row
     * becomes an associative array of column name => trimmed field value.
     * A blank first row aborts the read; blank rows after that are skipped.
     * Fields that have no corresponding header (row longer than the header
     * row) are ignored.
     *
     * @return array Zero-indexed list of associative rows (possibly empty).
     */
    public function get_data() {
        $first = true;
        $i = 0;
        $data = array();
        $headers = array();

        while (($line = $this->get_single_row()) !== NULL) {
            // get_single_row() always pushes at least one field before it
            // returns, so a blank line arrives as array(''), never as an
            // empty array. Test for that shape explicitly.
            $is_blank = empty($line) || (count($line) === 1 && $line[0] === '');

            // If the first line is empty, abort
            // If another line is empty, just skip it
            if ($is_blank) {
                if ($first) {
                    break;
                }
                continue;
            }

            // If we are on the first line, the columns are the headers
            if ($first) {
                $headers = $line;
                $first = false;
                continue;
            }

            foreach ($line as $ckey => $column) {
                // Row has more fields than the header row: there is no
                // column name to store this value under.
                if (!isset($headers[$ckey])) {
                    continue;
                }
                $data[$i][$headers[$ckey]] = trim($column);
            }
            $i++;
        }

        return $data;
    }

    /**
     * Parse one record from the file using a character-level state machine.
     *
     * Fields are split on $this->sep. A field that starts with '"' is read
     * as a quoted field, in which separators and line breaks are literal and
     * a doubled '"' yields a single '"'. A record ends at "\n", at "\r\n",
     * or at end of file. The state constants (self::field_start etc.) are
     * defined elsewhere in the class.
     *
     * @return array|null List of field strings, or NULL when the input is
     *                    exhausted or no file handle is open.
     */
    public function get_single_row() {
        // Nothing to read: either the input is exhausted, or the file was
        // never opened / has been closed.
        if ($this->eof || !isset($this->file_handle) || !is_resource($this->file_handle)) {
            return NULL;
        }

        $row = array();
        $field = "";
        $state = self::field_start;

        while (1) {
            $char = $this->next_char();

            if ($state == self::quoted_field) {
                if ($char === FALSE) {
                    // EOF. (TODO: error case - no closing quote)
                    $row[] = $field;
                    return $row;
                }
                // Fall through to accumulate quoted chars in switch() {...}
            } elseif ($char === FALSE || $char === "\n") {
                // End of record.
                // (TODO: error case if $state==self::field_start here - trailing comma)
                $row[] = $field;
                return $row;
            } elseif ($char === "\r") {
                // Possible start of \r\n line end, but might be just part of foo\rbar
                $state = ($state == self::found_quote) ? self::found_cr_q : self::found_cr;
                continue;
            } elseif ($char == $this->sep &&
                    ($state == self::field_start ||
                     $state == self::found_quote ||
                     $state == self::unquoted_field)) {
                // End of current field, start of next field
                $row[] = $field;
                $field = "";
                $state = self::field_start;
                continue;
            }

            switch ($state) {
                case self::field_start:
                    if ($char === '"') {
                        $state = self::quoted_field;
                    } else {
                        $state = self::unquoted_field;
                        $field .= $char;
                    }
                    break;

                case self::quoted_field:
                    if ($char === '"') {
                        $state = self::found_quote;
                    } else {
                        $field .= $char;
                    }
                    break;

                case self::unquoted_field:
                    $field .= $char;
                    // (TODO: error case if '"' in middle of unquoted field)
                    break;

                case self::found_quote:
                    // Found '"' escape sequence
                    $field .= $char;
                    $state = self::quoted_field;
                    // (TODO: error case if $char!='"' - non-separator char after single quote)
                    break;

                case self::found_cr:
                    // Lone \rX instead of \r\n. Treat as literal \rX. (TODO: error case?)
                    $field .= "\r" . $char;
                    $state = self::unquoted_field;
                    break;

                case self::found_cr_q:
                    // (TODO: error case: "foo"\rX instead of "foo"\r\n or "foo"\n)
                    $field .= "\r" . $char;
                    $state = self::quoted_field;
                    break;
            }
        }
    }

    /**
     * Return the buffered look-ahead character and read the next one.
     *
     * $this->nc always holds the character that will be returned by the next
     * call; $this->eof becomes TRUE as soon as that look-ahead is FALSE.
     *
     * @return string|false The current character, or FALSE at end of input.
     */
    public function next_char() {
        $c = $this->nc;

        // fgetc() on a missing or closed handle is a TypeError on PHP 8, so
        // treat that situation as end of input instead.
        if (isset($this->file_handle) && is_resource($this->file_handle)) {
            $this->nc = fgetc($this->file_handle);
        } else {
            $this->nc = FALSE;
        }

        $this->eof = ($this->nc === FALSE);
        return $c;
    }

    /**
     * Return the field delimiter that analyse_file() selects for a file.
     *
     * @param string $file Path of the file to inspect.
     * @return string The delimiter character.
     */
    public function get_separator($file) {
        $file_detail = self::analyse_file($file);
        return $file_detail['delimiter']['value'];
    }

    /**
     * Sample the start of a file and report its likely line ending and
     * field delimiter.
     *
     * Up to $capture_limit_in_kb kilobytes are read. Each candidate line
     * ending and delimiter is counted with substr_count() and the one with
     * the highest count is selected. A file that cannot be opened or read is
     * treated as an empty sample. When none of the candidate delimiters
     * occurs at all, comma is selected.
     *
     * @param string $file                Path of the file to inspect.
     * @param int    $capture_limit_in_kb Maximum sample size in kilobytes.
     * @return array Keys: 'peak_mem' (start/end), 'read_kb', 'line_ending'
     *               and 'delimiter' (each with results/count/key/value),
     *               and 'lines' (count/length).
     */
    public function analyse_file($file, $capture_limit_in_kb = 100) {
        $output = array();

        // capture starting memory usage
        $output['peak_mem']['start'] = memory_get_peak_usage(true);

        // log the limit how much of the file was sampled (in Kb)
        $output['read_kb'] = $capture_limit_in_kb;

        // Read the sample. Failure to open is suppressed, matching the
        // constructor's own @fopen: callers are expected to cope with a
        // missing file, so fall back to an empty sample.
        $byte_limit = (int) ($capture_limit_in_kb * 1024);
        $contents = '';
        $fh = ($byte_limit > 0) ? @fopen($file, 'r') : FALSE;
        if ($fh) {
            $chunk = fread($fh, $byte_limit);
            fclose($fh);
            if ($chunk !== FALSE) {
                $contents = $chunk;
            }
        }

        // specify allowed field delimiters
        $delimiters = array(
            'comma' => ',',
            'semicolon' => ';',
            'tab' => "\t",
            'pipe' => '|',
            'colon' => ':'
        );

        // specify allowed line endings
        $line_endings = array(
            'rn' => "\r\n",
            'n' => "\n",
            'r' => "\r",
            'nr' => "\n\r"
        );

        // loop and count each line ending instance
        $line_result = array();
        foreach ($line_endings as $key => $value) {
            $line_result[$key] = substr_count($contents, $value);
        }

        // sort ascending so that end() yields the most frequent entry
        asort($line_result);

        // log to output array
        $output['line_ending']['results'] = $line_result;
        $output['line_ending']['count'] = end($line_result);
        $output['line_ending']['key'] = key($line_result);
        $output['line_ending']['value'] = $line_endings[$output['line_ending']['key']];

        $lines = explode($output['line_ending']['value'], $contents);

        // The last line may be cut off, but only if the sample filled the
        // whole read limit. Otherwise drop it only when it is the empty
        // remainder after a trailing line ending.
        $truncated = strlen($contents) >= $byte_limit;
        if ($truncated || end($lines) === '') {
            array_pop($lines);
        }

        // create a string from the legal lines
        $complete_lines = implode(' ', $lines);

        // log statistics to output array
        $output['lines']['count'] = count($lines);
        $output['lines']['length'] = strlen($complete_lines);

        // loop and count each delimiter instance
        $delimiter_result = array();
        foreach ($delimiters as $delimiter_key => $delimiter) {
            $delimiter_result[$delimiter_key] = substr_count($complete_lines, $delimiter);
        }

        // sort ascending so that end() yields the most frequent entry
        asort($delimiter_result);

        // log statistics to output array with largest counts as the value
        $output['delimiter']['results'] = $delimiter_result;
        $output['delimiter']['count'] = end($delimiter_result);
        $output['delimiter']['key'] = key($delimiter_result);

        // No candidate found at all: do not let the sort order of a row of
        // zeros pick the delimiter; use comma.
        if ($output['delimiter']['count'] === 0) {
            $output['delimiter']['key'] = 'comma';
        }
        $output['delimiter']['value'] = $delimiters[$output['delimiter']['key']];

        // capture ending memory usage
        $output['peak_mem']['end'] = memory_get_peak_usage(true);

        return $output;
    }
}