/*
 * Copyright 1995, 2003 Perforce Software.  All rights reserved.
 *
 * This file is part of Perforce - the FAST SCM System.
 */

#include "validate.h"

/*
 * ValidateCharSet
 */

// Out-of-line definition of the CharSetValid destructor.  The body is
// intentionally empty: nothing is released here.
CharSetValid::~CharSetValid() {}

// A new validator starts between characters: no following bytes are
// owed and no lead-specific ("magic") check is pending.  This is the
// same state that Reset() puts the object in.
CharSetUTF8Valid::CharSetUTF8Valid()
    : followcnt( 0 )
    , magic( 0 )
{}

// Forget any partially consumed multi-byte character so that the next
// byte handed to Valid() is treated as the start of a character.
void
CharSetUTF8Valid::Reset()
{
	followcnt = magic = 0;
}

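/*
 * What do these bits mean?
 *
 * 0x40 First byte of a multi-byte sequence, which includes trivial
 *        multi-byte sequences of length one (i.e. only this byte)
 * 0x80 Part of a multi-byte sequence (a following byte)
 * 0x07 On a first byte: count of following bytes
 * 0x38 On a first byte: "magic" value selecting an extra range check
 *        that Valid() applies to the first following byte:
 *          0x08 lead is 0xed (UTF-16 surrogate)
 *          0x10 lead is 0xf4
 *          0x20 lead is 0xf0
 *          0x30 lead is 0xe0
 * 0x30 On a following byte: range markers tested by those checks
 *        (0x10 is set for 0x80 - 0x9f, 0x20 is set for 0x80 - 0x8f)
 */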

unsigned char
CharSetUTF8Valid::validmap[256] = {
    // One row per high nibble, sixteen entries per row.

    // 0x00 - 0x7f: single-byte characters (lead flag, no following bytes)
    /* 0x00 */ 0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,
    /* 0x10 */ 0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,
    /* 0x20 */ 0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,
    /* 0x30 */ 0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,
    /* 0x40 */ 0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,
    /* 0x50 */ 0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,
    /* 0x60 */ 0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,
    /* 0x70 */ 0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,  0x40, 0x40, 0x40, 0x40,

    // 0x80 - 0x8f: following bytes carrying both range markers (0x20 | 0x10)
    /* 0x80 */ 0xb0, 0xb0, 0xb0, 0xb0,  0xb0, 0xb0, 0xb0, 0xb0,  0xb0, 0xb0, 0xb0, 0xb0,  0xb0, 0xb0, 0xb0, 0xb0,

    // 0x90 - 0x9f: following bytes carrying only the 0x10 range marker
    /* 0x90 */ 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90,

    // 0xa0 - 0xbf: following bytes with no range marker
    /* 0xa0 */ 0x80, 0x80, 0x80, 0x80,  0x80, 0x80, 0x80, 0x80,  0x80, 0x80, 0x80, 0x80,  0x80, 0x80, 0x80, 0x80,
    /* 0xb0 */ 0x80, 0x80, 0x80, 0x80,  0x80, 0x80, 0x80, 0x80,  0x80, 0x80, 0x80, 0x80,  0x80, 0x80, 0x80, 0x80,

    // 0xc0, 0xc1 illegal; 0xc2 - 0xdf lead a two-byte character
    /* 0xc0 */ 0x00, 0x00, 0x41, 0x41,  0x41, 0x41, 0x41, 0x41,  0x41, 0x41, 0x41, 0x41,  0x41, 0x41, 0x41, 0x41,
    /* 0xd0 */ 0x41, 0x41, 0x41, 0x41,  0x41, 0x41, 0x41, 0x41,  0x41, 0x41, 0x41, 0x41,  0x41, 0x41, 0x41, 0x41,

    // 0xe0 - 0xef lead a three-byte character ( 0xe0 and 0xed are magical )
    /* 0xe0 */ 0x72, 0x42, 0x42, 0x42,  0x42, 0x42, 0x42, 0x42,  0x42, 0x42, 0x42, 0x42,  0x42, 0x4a, 0x42, 0x42,

    // 0xf0 - 0xf4 lead a four-byte character ( 0xf0 and 0xf4 are magical );
    // 0xf5 - 0xff illegal
    /* 0xf0 */ 0x63, 0x43, 0x43, 0x43,  0x53, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00
};

/*
 * Valid - check the next len bytes of a UTF-8 stream
 *
 * The object remembers (in followcnt and magic) any multi-byte
 * character left unfinished by the previous call, so a stream may be
 * fed in arbitrary pieces.
 *
 * return values are...
 *
 * 0 not valid
 * 1 valid
 * 3 valid so far (following bytes needed to complete a multi-byte char)
 *
 * If retp is non-null, *retp is pointed at each byte examined as the
 * start of a character; when 1 is returned it is instead left one past
 * the last byte consumed.  If no character starts within this call and
 * the result is not 1, *retp is not written.
 *
 * A return of 0 does not reset followcnt or magic; they keep whatever
 * values they had when the bad byte was found.
 */

int
CharSetUTF8Valid::Valid( const char *buf, int len, const char **retp )
{
	for( ; len > 0; --len, ++buf )
	{
	    const int flags = validmap[0xff & *buf];

	    if( !followcnt )
	    {
		// Between characters: this byte has to be a lead byte.
		// Note its position before judging it.
		if( retp )
		    *retp = buf;
		if( !( flags & 0x40 ) )
		    return 0;

		followcnt = flags & 0x7;
		magic = flags & 0x38;
		continue;
	    }

	    // Inside a multi-byte character: must be a following byte.
	    if( !( flags & 0x80 ) )
		return 0;
	    --followcnt;

	    if( !magic )
		continue;

	    // First following byte after a "magical" lead: the lead
	    // narrows the range this byte may come from.
	    bool bad = false;

	    if( magic == 0x10 )		// lead is 0xf4: need 0x80 - 0x8f
		bad = !( flags & 0x20 );
	    else if( magic == 0x20 )	// lead is 0xf0: need 0x90 - 0xbf
		bad = ( flags & 0x20 ) != 0;
	    else if( magic == 0x30 )	// lead is 0xe0: need 0xa0 - 0xbf
		bad = ( flags & 0x10 ) != 0;
	    else if( magic == 0x08 )	// lead is 0xed: need 0x80 - 0x9f
		bad = !( flags & 0x30 );	// (0xa0 - 0xbf are UTF-16 surrogates)

	    if( bad )
		return 0;

	    // Only the first following byte is restricted.
	    magic = 0;
	}

	// Out of input.  Report the end position only if we stopped on a
	// character boundary; otherwise more bytes are still owed.
	if( !followcnt && retp )
	    *retp = buf;

	return followcnt ? 3 : 1;
}
