Skip to content

Segment v3.24.0+

The pinyin-pro library exports a segment function that splits text into words and returns the segmented words, their pinyin, or both, depending on the chosen output format.

TIP

For accurate word segmentation, first register a comprehensive dictionary, such as @pinyin-pro/data/complete or @pinyin-pro/data/modern, with the addDict API before calling the segment API.

Example

Basic Usage

js
import { segment, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造');

// Conversion result:
[
  { origin: '小', result: 'xiǎo' },
  { origin: '明', result: 'míng' },
  { origin: '硕士', result: 'shuòshì' },
  { origin: '毕业', result: 'bìyè' },
  { origin: '于', result: 'yú' },
  { origin: '中国科学院', result: 'zhōngguókēxuéyuàn' },
  { origin: '计算所', result: 'jìsuànsuǒ' },
  { origin: ',', result: ',' },
  { origin: '后', result: 'hòu' },
  { origin: '在', result: 'zài' },
  { origin: '日本京都大学', result: 'rìběnjīngdūdàxué' },
  { origin: '深造', result: 'shēnzào' },
];
import { segment, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造');

// Conversion result:
[
  { origin: '小', result: 'xiǎo' },
  { origin: '明', result: 'míng' },
  { origin: '硕士', result: 'shuòshì' },
  { origin: '毕业', result: 'bìyè' },
  { origin: '于', result: 'yú' },
  { origin: '中国科学院', result: 'zhōngguókēxuéyuàn' },
  { origin: '计算所', result: 'jìsuànsuǒ' },
  { origin: ',', result: ',' },
  { origin: '后', result: 'hòu' },
  { origin: '在', result: 'zài' },
  { origin: '日本京都大学', result: 'rìběnjīngdūdàxué' },
  { origin: '深造', result: 'shēnzào' },
];

Different Output Formats

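Use the format parameter to choose the output format. When format is omitted, the output is the same as with OutputFormat.AllSegment, as the Basic Usage example shows.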

AllSegment

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.AllSegment,
});

// Conversion result:
[
  { origin: '小', result: 'xiǎo' },
  { origin: '明', result: 'míng' },
  { origin: '硕士', result: 'shuòshì' },
  { origin: '毕业', result: 'bìyè' },
  { origin: '于', result: 'yú' },
  { origin: '中国科学院', result: 'zhōngguókēxuéyuàn' },
  { origin: '计算所', result: 'jìsuànsuǒ' },
  { origin: ',', result: ',' },
  { origin: '后', result: 'hòu' },
  { origin: '在', result: 'zài' },
  { origin: '日本京都大学', result: 'rìběnjīngdūdàxué' },
  { origin: '深造', result: 'shēnzào' },
];
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.AllSegment,
});

// Conversion result:
[
  { origin: '小', result: 'xiǎo' },
  { origin: '明', result: 'míng' },
  { origin: '硕士', result: 'shuòshì' },
  { origin: '毕业', result: 'bìyè' },
  { origin: '于', result: 'yú' },
  { origin: '中国科学院', result: 'zhōngguókēxuéyuàn' },
  { origin: '计算所', result: 'jìsuànsuǒ' },
  { origin: ',', result: ',' },
  { origin: '后', result: 'hòu' },
  { origin: '在', result: 'zài' },
  { origin: '日本京都大学', result: 'rìběnjīngdūdàxué' },
  { origin: '深造', result: 'shēnzào' },
];

AllArray

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.AllArray,
});

// Conversion result:
[
  [{ origin: '小', result: 'xiǎo' }],
  [{ origin: '明', result: 'míng' }],
  [
    { origin: '硕', result: 'shuò' },
    { origin: '士', result: 'shì' },
  ],
  [
    { origin: '毕', result: 'bì' },
    { origin: '业', result: 'yè' },
  ],
  [{ origin: '于', result: 'yú' }],
  [
    { origin: '中', result: 'zhōng' },
    { origin: '国', result: 'guó' },
    { origin: '科', result: 'kē' },
    { origin: '学', result: 'xué' },
    { origin: '院', result: 'yuàn' },
  ],
  [
    { origin: '计', result: 'jì' },
    { origin: '算', result: 'suàn' },
    { origin: '所', result: 'suǒ' },
  ],
  [{ origin: ',', result: ',' }],
  [{ origin: '后', result: 'hòu' }],
  [{ origin: '在', result: 'zài' }],
  [
    { origin: '日', result: 'rì' },
    { origin: '本', result: 'běn' },
    { origin: '京', result: 'jīng' },
    { origin: '都', result: 'dū' },
    { origin: '大', result: 'dà' },
    { origin: '学', result: 'xué' },
  ],
  [
    { origin: '深', result: 'shēn' },
    { origin: '造', result: 'zào' },
  ],
];
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.AllArray,
});

// Conversion result:
[
  [{ origin: '小', result: 'xiǎo' }],
  [{ origin: '明', result: 'míng' }],
  [
    { origin: '硕', result: 'shuò' },
    { origin: '士', result: 'shì' },
  ],
  [
    { origin: '毕', result: 'bì' },
    { origin: '业', result: 'yè' },
  ],
  [{ origin: '于', result: 'yú' }],
  [
    { origin: '中', result: 'zhōng' },
    { origin: '国', result: 'guó' },
    { origin: '科', result: 'kē' },
    { origin: '学', result: 'xué' },
    { origin: '院', result: 'yuàn' },
  ],
  [
    { origin: '计', result: 'jì' },
    { origin: '算', result: 'suàn' },
    { origin: '所', result: 'suǒ' },
  ],
  [{ origin: ',', result: ',' }],
  [{ origin: '后', result: 'hòu' }],
  [{ origin: '在', result: 'zài' }],
  [
    { origin: '日', result: 'rì' },
    { origin: '本', result: 'běn' },
    { origin: '京', result: 'jīng' },
    { origin: '都', result: 'dū' },
    { origin: '大', result: 'dà' },
    { origin: '学', result: 'xué' },
  ],
  [
    { origin: '深', result: 'shēn' },
    { origin: '造', result: 'zào' },
  ],
];

AllString

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.AllString,
});

// Conversion result:
{
  origin: "小 明 硕士 毕业 于 中国科学院 计算所 , 后 在 日本京都大学 深造",
  result: "xiǎo míng shuòshì bìyè yú zhōngguókēxuéyuàn jìsuànsuǒ , hòu zài rìběnjīngdūdàxué shēnzào",
}
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.AllString,
});

// Conversion result:
{
  origin: "小 明 硕士 毕业 于 中国科学院 计算所 , 后 在 日本京都大学 深造",
  result: "xiǎo míng shuòshì bìyè yú zhōngguókēxuéyuàn jìsuànsuǒ , hòu zài rìběnjīngdūdàxué shēnzào",
}

PinyinSegment

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.PinyinSegment,
});

// Conversion result:
[
  'xiǎo',
  'míng',
  'shuòshì',
  'bìyè',
  'yú',
  'zhōngguókēxuéyuàn',
  'jìsuànsuǒ',
  ',',
  'hòu',
  'zài',
  'rìběnjīngdūdàxué',
  'shēnzào',
];
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.PinyinSegment,
});

// Conversion result:
[
  'xiǎo',
  'míng',
  'shuòshì',
  'bìyè',
  'yú',
  'zhōngguókēxuéyuàn',
  'jìsuànsuǒ',
  ',',
  'hòu',
  'zài',
  'rìběnjīngdūdàxué',
  'shēnzào',
];

PinyinArray

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.PinyinArray,
});

// Conversion result:
[
  ['xiǎo'],
  ['míng'],
  ['shuò', 'shì'],
  ['bì', 'yè'],
  ['yú'],
  ['zhōng', 'guó', 'kē', 'xué', 'yuàn'],
  ['jì', 'suàn', 'suǒ'],
  [','],
  ['hòu'],
  ['zài'],
  ['rì', 'běn', 'jīng', 'dū', 'dà', 'xué'],
  ['shēn', 'zào'],
];
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.PinyinArray,
});

// Conversion result:
[
  ['xiǎo'],
  ['míng'],
  ['shuò', 'shì'],
  ['bì', 'yè'],
  ['yú'],
  ['zhōng', 'guó', 'kē', 'xué', 'yuàn'],
  ['jì', 'suàn', 'suǒ'],
  [','],
  ['hòu'],
  ['zài'],
  ['rì', 'běn', 'jīng', 'dū', 'dà', 'xué'],
  ['shēn', 'zào'],
];

PinyinString

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.PinyinString,
});

// Conversion result:
('xiǎo míng shuòshì bìyè yú zhōngguókēxuéyuàn jìsuànsuǒ , hòu zài rìběnjīngdūdàxué shēnzào');
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.PinyinString,
});

// Conversion result:
('xiǎo míng shuòshì bìyè yú zhōngguókēxuéyuàn jìsuànsuǒ , hòu zài rìběnjīngdūdàxué shēnzào');

ZhSegment

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.ZhSegment,
});

// Conversion result:
[
  '小',
  '明',
  '硕士',
  '毕业',
  '于',
  '中国科学院',
  '计算所',
  ',',
  '后',
  '在',
  '日本京都大学',
  '深造',
];
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.ZhSegment,
});

// Conversion result:
[
  '小',
  '明',
  '硕士',
  '毕业',
  '于',
  '中国科学院',
  '计算所',
  ',',
  '后',
  '在',
  '日本京都大学',
  '深造',
];

ZhArray

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.ZhArray,
});

// Conversion result:
[
  ['小'],
  ['明'],
  ['硕', '士'],
  ['毕', '业'],
  ['于'],
  ['中', '国', '科', '学', '院'],
  ['计', '算', '所'],
  [','],
  ['后'],
  ['在'],
  ['日', '本', '京', '都', '大', '学'],
  ['深', '造'],
];
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.ZhArray,
});

// Conversion result:
[
  ['小'],
  ['明'],
  ['硕', '士'],
  ['毕', '业'],
  ['于'],
  ['中', '国', '科', '学', '院'],
  ['计', '算', '所'],
  [','],
  ['后'],
  ['在'],
  ['日', '本', '京', '都', '大', '学'],
  ['深', '造'],
];

ZhString

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.ZhString,
});

// Conversion result:
('小 明 硕士 毕业 于 中国科学院 计算所 , 后 在 日本京都大学 深造');
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.ZhString,
});

// Conversion result:
('小 明 硕士 毕业 于 中国科学院 计算所 , 后 在 日本京都大学 深造');

Custom Separator

Use the separator parameter to customize the string placed between segments. It only applies to the AllString, PinyinString, and ZhString output formats, which join segments with a single space when no separator is given.

js
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.AllString,
  separator: '/',
});

// Conversion result:
{
  origin: "小/明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造",
  result:
    "xiǎo/míng/shuòshì/bìyè/yú/zhōngguókēxuéyuàn/jìsuànsuǒ/,/hòu/zài/rìběnjīngdūdàxué/shēnzào",
}
import { segment, OutputFormat, addDict } from 'pinyin-pro';
import CompleteDict from '@pinyin-pro/data/complete';

addDict(CompleteDict);

const result = segment('小明硕士毕业于中国科学院计算所,后在日本京都大学深造', {
  format: OutputFormat.AllString,
  separator: '/',
});

// Conversion result:
{
  origin: "小/明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造",
  result:
    "xiǎo/míng/shuòshì/bìyè/yú/zhōngguókēxuéyuàn/jìsuànsuǒ/,/hòu/zài/rìběnjīngdūdàxué/shēnzào",
}

Syntax and Parameters

Apart from format and separator, all parameters behave the same as the corresponding parameters of the pinyin API.

ts
import { segment, OutputFormat } from 'pinyin-pro';

interface SegmentOptions {
  toneType?: 'symbol' | 'num' | 'none';
  removeNonZh?: boolean;
  nonZh?: 'spaced' | 'consecutive' | 'removed';
  v?: boolean;
  segmentit?: TokenizationAlgorithm;
  surname?: 'off' | 'head' | 'all';
  mode?: 'normal' | 'surname';
  toneSandhi?: boolean;
  nonZhScope?: RegExp;
  separator?: string;
  format?: OutputFormat;
}

function segment(text: string, options?: SegmentOptions);
import { segment, OutputFormat } from 'pinyin-pro';

interface SegmentOptions {
  toneType?: 'symbol' | 'num' | 'none';
  removeNonZh?: boolean;
  nonZh?: 'spaced' | 'consecutive' | 'removed';
  v?: boolean;
  segmentit?: TokenizationAlgorithm;
  surname?: 'off' | 'head' | 'all';
  mode?: 'normal' | 'surname';
  toneSandhi?: boolean;
  nonZhScope?: RegExp;
  separator?: string;
  format?: OutputFormat;
}

function segment(text: string, options?: SegmentOptions);