#!/usr/bin/perl

# Copyright 2018 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generate spec_parser.h from sentencepiece_model.proto
#
# usage: ./gen_spec_parser.pl sentencepiece_model.proto > spec_parser.h

use strict;
use warnings;

# ProcessPrinter($filename)
#
# Scans the .proto file $filename line by line and prints, to the currently
# selected output handle, an anonymous C++ namespace holding one
# PrintProto() function for every message whose name matches TrainerSpec or
# NormalizerSpec.
#
# For each non-deprecated `optional`/`repeated` field of such a message:
#   * scalar/string types are streamed directly,
#   * enum types declared earlier in the file are printed through a
#     generated enum-value -> name map.
# Fields of any other type produce no output.
#
# Dies if $filename cannot be opened.
sub ProcessPrinter {
  my ($filename) = @_;
  my $classname = "";
  my $valid = 0;  # True while inside a message we generate code for.
  my %enum;       # Names of every enum seen so far (in any message).

  open(my $fh, '<', $filename) or die "cannot open $filename: $!";
  print "namespace {\n";

  while (my $line = <$fh>) {
    chomp $line;
    if ($line =~ /^\s*message (\S+)/) {
      $classname = $1;
      $valid = 0;
      if ($classname =~ /(TrainerSpec|NormalizerSpec)/) {
        print "inline std::string PrintProto(const ${classname} &message) {\n";
        print " std::ostringstream os;\n\n";
        print " os << \"${classname} {\\n\";\n";
        $valid = 1;
      }
    } elsif ($line =~ /^\s*}/) {
      # Closing brace of a message: finish the generated function.
      next if (!$valid);
      print " os << \"}\\n\";\n";
      print "\n return os.str();\n";
      print "}\n\n";
    } elsif ($line =~ /enum\s*(\S+)/) {
      my $name = $1;
      # Remember the enum even when it is outside a target message, so that
      # later fields can be recognised as enum-typed.
      $enum{$name} = 1;
      next if (!$valid);
      print " static const std::map<${classname}::${name}, std::string> k${name}_Map = { ";
      # Consume the enum body from the same handle; its closing brace must
      # not be seen by the outer loop, which would take it for the end of
      # the message.
      while (my $entry = <$fh>) {
        if ($entry =~ /(\S+)\s*=\s*(\d+)/) {
          my $value_name = $1;
          print "{${classname}::${value_name}, \"${value_name}\"}, ";
        } elsif ($entry =~ /}/) {
          print " };\n";
          last;
        }
      }
    } elsif ($line =~ /\s*(repeated|optional)\s+(\S+)\s+(\S+)\s*=\s*(\d+)/) {
      # Save the captures before running any further match.
      my ($opt, $type, $name) = ($1, $2, $3);
      next if ($line =~ /deprecated = true/);
      next if (!$valid);
      if ($type =~ /(int|double|float|bool|string)/) {
        if ($opt eq "optional") {
          print " os << \" ${name}: \" << message.${name}() << \"\\n\";\n";
        } else {
          print " for (const auto &v : message.${name}())\n";
          print " os << \" ${name}: \" << v << \"\\n\";\n";
        }
      } elsif (defined $enum{$type}) {
        if ($opt eq "optional") {
          print " {\n";
          print " const auto it = k${type}_Map.find(message.${name}());\n";
          print " if (it == k${type}_Map.end())\n";
          print " os << \" ${name}: unknown\\n\";\n";
          print " else\n";
          print " os << \" ${name}: \" << it->second << \"\\n\";\n";
          print " }\n";
        } else {
          print " for (const auto &v : message.${name}()) {\n";
          print " const auto it = k${type}_Map.find(v);\n";
          print " if (it == k${type}_Map.end())\n";
          print " os << \" ${name}: unknown\\n\";\n";
          print " else\n";
          print " os << \" ${name}: \" << it->second << \"\\n\";\n";
          print " }\n";
        }
      }
    }
  }

  print "} // namespace\n\n";
  close($fh);
}

# ProcessParser($filename)
#
# Scans the .proto file $filename line by line and prints, to the currently
# selected output handle, one C++ SentencePieceTrainer::SetProtoField()
# definition for every message whose name matches TrainerSpec or
# NormalizerSpec.
#
# Each non-deprecated `optional`/`repeated` field becomes an
# `if (name == "<field>")` branch that converts the string value and calls
# set_<field>() (optional) or add_<field>() once per comma-separated item
# (repeated). Enum-typed fields are resolved through a generated
# name -> enum-value map.
#
# Dies if $filename cannot be opened.
sub ProcessParser {
  my ($filename) = @_;
  my $classname = "";
  my $valid = 0;  # True while inside a message we generate code for.
  my %enum;       # Names of every enum seen so far (in any message).

  open(my $fh, '<', $filename) or die "cannot open $filename: $!";

  while (my $line = <$fh>) {
    if ($line =~ /^\s*message (\S+)/) {
      $classname = $1;
      $valid = 0;
      if ($classname =~ /(TrainerSpec|NormalizerSpec)/) {
        print "util::Status SentencePieceTrainer::SetProtoField(const std::string& name, const std::string& value, ${classname} *message) {\n";
        print " CHECK_OR_RETURN(message);\n\n";
        $valid = 1;
      }
    } elsif ($line =~ /^\s*}/) {
      # Closing brace of a message: no branch matched, report NOT_FOUND.
      next if (!$valid);
      print " return util::StatusBuilder(util::error::NOT_FOUND)\n";
      print " << \"unknown field name \\\"\" << name << \"\\\" in ${classname}.\";\n";
      print "}\n\n";
    } elsif ($line =~ /enum\s*(\S+)/) {
      my $name = $1;
      $enum{$name} = 1;
      next if (!$valid);
      # The map is keyed by the enum value's name and yields the enum value,
      # mirroring (inverted) the explicitly typed map that ProcessPrinter
      # emits; the template arguments are spelled out for that reason.
      print " static const std::map<std::string, ${classname}::${name}> k${name}_Map = { ";
      # Consume the enum body here so its closing brace is not mistaken for
      # the end of the message by the outer loop.
      while (my $entry = <$fh>) {
        if ($entry =~ /(\S+)\s*=\s*(\d+)/) {
          my $value_name = $1;
          print "{\"${value_name}\", ${classname}::${value_name}}, ";
        } elsif ($entry =~ /}/) {
          print " };\n\n";
          last;
        }
      }
    } elsif ($line =~ /\s*(repeated|optional)\s+(\S+)\s+(\S+)\s*=\s*(\d+)/) {
      # Save the captures before running any further match.
      my ($opt, $type, $name) = ($1, $2, $3);
      next if ($line =~ /deprecated = true/);
      next if (!$valid);

      my $func_prefix = $opt eq "optional" ? "set_" : "add_";
      my $body = "";
      if ($type =~ /(int|double|float|bool)/) {
        # An empty value for a bool flag is parsed as "true".
        my $empty = $type eq "bool" ? "\"true\"" : "\"\"";
        $body =
            "${type} v;\n" .
            " if (!string_util::lexical_cast(val.empty() ? ${empty} : val, &v))\n" .
            " return util::StatusBuilder(util::error::INVALID_ARGUMENT) << \"cannot parse \\\"\" << val << \"\\\" as ${type}.\";\n" .
            " message->${func_prefix}${name}(v);\n";
      } elsif ($type =~ /string/) {
        $body = "message->${func_prefix}${name}(val);\n";
      } elsif ($type =~ /bytes/) {
        $body = "message->${func_prefix}${name}(val.data(), val.size());\n";
      } elsif (defined $enum{$type}) {
        $body =
            "const auto it = k${type}_Map.find(string_util::ToUpper(val));\n" .
            " if (it == k${type}_Map.end())\n" .
            " return util::StatusBuilder(util::error::INVALID_ARGUMENT) << \"unknown enumeration value of \\\"\" << val << \"\\\" as ${type}.\";\n" .
            " message->${func_prefix}${name}(it->second);\n";
      }

      print " if (name == \"${name}\") {\n";
      if ($opt eq "repeated") {
        print " for (const auto &val : string_util::Split(value, \",\")) {\n";
        print " ${body}";
        print " }\n";
      } else {
        print " const auto &val = value;\n";
        print " ${body}";
      }
      print " return util::OkStatus();\n";
      print " }\n\n";
    }
  }

  close($fh);
}

# Emit the printer section followed by the parser section for every .proto
# file given on the command line.
for my $file (@ARGV) {
  ProcessPrinter($file);
  ProcessParser($file);
}