jcseg
/
jcseg-server.properties

# Jcseg server configuration file, written in JSON syntax with '#' line comments.
# Send questions or bug reports to chenxin <chenxin619315@gmail.com>.
{
    # Jcseg server configuration.
    "server_config": {
        # Server port.
        "port": 1990,

        # Default communication charset.
        "charset": "utf-8",

        # HTTP connection idle timeout, in ms.
        "http_connection_idle_timeout": 60000,

        # Jetty maximum thread pool size.
        "max_thread_pool_size": 200,

        # Thread idle timeout, in ms.
        "thread_idle_timeout": 30000,

        # HTTP output buffer size.
        "http_output_buffer_size": 32768,

        # HTTP request header size.
        "http_request_header_size": 8192,

        # HTTP response header size.
        "http_response_header_size": 8192
    },


    # Global settings for jcseg: another copy of the old
    # configuration file jcseg.properties.
    "jcseg_global_config": {
        # Maximum match length. (5-7)
        "jcseg_maxlen": 7,

        # Whether to recognize Chinese names.
        # (true to enable it and false to disable it)
        "jcseg_icnname": true,

        # Maximum number of Chinese words in an English-Chinese mixed word.
        "jcseg_mixcnlen": 3,

        # Maximum length of paired-punctuation text.
        # Set it to 0 to disable this feature.
        "jcseg_pptmaxlen": 7,

        # Maximum length for Chinese last name andron.
        "jcseg_cnmaxlnadron": 1,

        # Whether to clear the stopwords.
        # (true to clear stopwords and false to keep them)
        "jcseg_clearstopword": false,

        # Whether to convert Chinese numerals to Arabic numbers.
        # (true to enable it and false to disable it) e.g. '\u4E09\u4E07' to 30000.
        "jcseg_cnnumtoarabic": true,

        # Whether to convert Chinese fractions to Arabic fractions.
        # @Note: disable it for lucene, solr, elasticsearch, etc.
        "jcseg_cnfratoarabic": false,

        # Whether to keep unrecognized words.
        # (true to keep unrecognized words and false to clear them)
        "jcseg_keepunregword": true,

        # Whether to run the secondary segmentation for complex English words.
        "jcseg_ensencondseg": true,

        # Minimum length of a secondary simple token.
        # (better larger than 1)
        "jcseg_stokenminlen": 2,

        # Threshold for Chinese name recognition.
        # Better not to change it unless you know what you are doing.
        "jcseg_nsthreshold": 1000000,

        # The punctuation characters that will be kept inside a token.
        # (Not at the end of the token.)
        "jcseg_keeppunctuations": "@#%.&+"
    },

    # Dictionary instance settings.
    # Add your own instances here, using the same JSON syntax as "master".
    "jcseg_dict": {
        "master": {
            # Set to null to load the lexicons from the classpath.
            # The key is double-quoted like every other key in this file:
            # a bare member name is not valid JSON and only lenient parsers accept it.
            "path": null,
            #"path": [
            #    "{jar.dir}/lexicon"
            #    # absolute path here
            #    # "/java/JavaSE/jcseg/lexicon"
            #],

            # Whether to load the part of speech of the words.
            "loadpos": true,

            # Whether to load the pinyin of the words.
            "loadpinyin": true,

            # Whether to load the synonyms of the words.
            "loadsyn": true,

            # Whether to automatically reload modified lexicon files.
            "autoload": true,

            # Poll time for auto load. (in seconds)
            "polltime": 300
        }

        # Add more of your own instances here, e.g.:
        # ,"name" : {
        #   "path": [
        #       "absolute jcseg standard lexicon path 1",
        #       "absolute jcseg standard lexicon path 2"
        #       ...
        #   ],
        #   "autoload": false,
        #   "polltime": 300
        # }
    },

    # JcsegTaskConfig instance settings.
    # @Note:
    # Every config instance here extends the global settings above.
    # An instance that overrides nothing inherits all of the global settings.
    "jcseg_config": {
        "master": {
            # Extends and overrides the global settings.
            "jcseg_pptmaxlen": 0,
            "jcseg_cnfratoarabic": true,
            "jcseg_keepunregword": false
        }

        # This one is for keyword, keyphrase, sentence and summary extraction.
        # @Note: do not delete this instance if you want jcseg to
        # offer the extractor service.
        ,"extractor": {
            "jcseg_pptmaxlen": 0,
            "jcseg_clearstopword": true,
            "jcseg_cnnumtoarabic": false,
            "jcseg_cnfratoarabic": false,
            "jcseg_keepunregword": false,
            "jcseg_ensencondseg": false
        }

        # Add more of your own instances here:
        # ,"name": {
        #   ...
        # }
    },

    # Jcseg tokenizer instance settings.
    # You can have an instance serve requests by accessing:
    # http://jcseg_server_host:port/tokenizer/instance_name
    # where instance_name is the name of an instance you define here.
    "jcseg_tokenizer": {
        "master": {
            # Jcseg tokenizer algorithm, one of:
            # 1: SIMPLE_MODE
            # 2: COMPLEX_MODE
            # 3: DETECT_MODE
            # See org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig for more info.
            "algorithm": 2,

            # Dictionary instance name.
            # Choose one of the instances defined above in the dict scope.
            "dict": "master",

            # JcsegTaskConfig instance name.
            # Choose one of the instances defined above in the config scope.
            "config": "master"
        }

        # This tokenizer instance is for the extractor service.
        # Do not delete it if you want jcseg to offer the extractor service.
        ,"extractor": {
            "algorithm": 2,
            "dict": "master",
            "config": "extractor"
        }

        # Add more of your own instances here:
        # ,"name": {
        #   ...
        # }
    }
}