Building a Smart Python-to-R Code Converter with Gemini AI-Powered Validation and Feedback


import re
from typing import Dict, Optional


class EnhancedPythonToRConverter:
    """
    Enhanced Python to R converter with Gemini AI validation
    """


    def __init__(self, gemini_api_key: Optional[str] = None):
        self.validator = GeminiValidator(gemini_api_key)


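        # Map common Python data-science imports to roughly equivalent R libraries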
        self.import_mappings = {
            'pandas': 'library(dplyr)\nlibrary(tidyr)\nlibrary(readr)',
            'numpy': 'library(base)',
            'matplotlib.pyplot': 'library(ggplot2)',
            'seaborn': 'library(ggplot2)\nlibrary(RColorBrewer)',
            'scipy.stats': 'library(stats)',
            'sklearn': 'library(caret)\nlibrary(randomForest)\nlibrary(e1071)',
            'statsmodels': 'library(stats)\nlibrary(lmtest)',
            'plotly': 'library(plotly)',
        }


        self.function_mappings = {
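            # pandas: DataFrame creation, I/O, and common methods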
            'pd.DataFrame': 'data.frame',
            'pd.read_csv': 'read.csv',
            'pd.read_excel': 'read_excel',
            'df.head': 'head',
            'df.tail': 'tail',
            'df.shape': 'dim',
            'df.info': 'str',
            'df.describe': 'summary',
            'df.mean': 'mean',
            'df.median': 'median',
            'df.std': 'sd',
            'df.var': 'var',
            'df.sum': 'sum',
            'df.count': 'length',
            'df.groupby': 'group_by',
            'df.merge': 'merge',
            'df.drop': 'select',
            'df.dropna': 'na.omit',
            'df.fillna': 'replace_na',
            'df.sort_values': 'arrange',
            'df.value_counts': 'table',


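            # NumPy: array creation, math, and random sampling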
            'np.array': 'c',
            'np.mean': 'mean',
            'np.median': 'median',
            'np.std': 'sd',
            'np.var': 'var',
            'np.sum': 'sum',
            'np.min': 'min',
            'np.max': 'max',
            'np.sqrt': 'sqrt',
            'np.log': 'log',
            'np.exp': 'exp',
            'np.random.normal': 'rnorm',
            'np.random.uniform': 'runif',
            'np.linspace': 'seq',
            'np.arange': 'seq',


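            # matplotlib / seaborn: approximate ggplot2 geoms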
            'plt.figure': 'ggplot',
            'plt.plot': 'geom_line',
            'plt.scatter': 'geom_point',
            'plt.hist': 'geom_histogram',
            'plt.bar': 'geom_bar',
            'plt.boxplot': 'geom_boxplot',
            'plt.show': 'print',
            'sns.scatterplot': 'geom_point',
            'sns.histplot': 'geom_histogram',
            'sns.boxplot': 'geom_boxplot',
            'sns.heatmap': 'geom_tile',


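            # scipy.stats: hypothesis tests and correlations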
            'scipy.stats.ttest_ind': 't.test',
            'scipy.stats.chi2_contingency': 'chisq.test',
            'scipy.stats.pearsonr': 'cor.test',
            'scipy.stats.spearmanr': 'cor.test',
            'scipy.stats.normaltest': 'shapiro.test',
            'stats.ttest_ind': 't.test',


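            # scikit-learn: modeling helpers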
            'sklearn.linear_model.LinearRegression': 'lm',
            'sklearn.ensemble.RandomForestRegressor': 'randomForest',
            'sklearn.model_selection.train_test_split': 'sample',
        }


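        # Regex substitutions for literals, ranges, string methods, and 0-based indexing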
        self.syntax_patterns = [
            (r'\bTrue\b', 'TRUE'),
            (r'\bFalse\b', 'FALSE'),
            (r'\bNone\b', 'NULL'),
            (r'\blen\(', 'length('),
            (r'range\((\d+)\)', r'1:\1'),
            (r'range\((\d+),\s*(\d+)\)', r'\1:\2'),
            (r'\.split\(', '.strsplit('),
            (r'\.strip\(\)', '.str_trim()'),
            (r'\.lower\(\)', '.str_to_lower()'),
            (r'\.upper\(\)', '.str_to_upper()'),
            (r'\[0\]', '[1]'),
            (r'f"([^"]*)"', r'paste0("\1")'),
            (r"f'([^']*)'", r"paste0('\1')"),
        ]


    def convert_imports(self, code: str) -> str:
        """Convert Python import statements to R library statements."""
        lines = code.split('\n')
        converted_lines = []


        for line in lines:
            line = line.strip()
            if line.startswith('import ') or line.startswith('from '):
                if ' as ' in line:
                    if line.startswith('from '):
                        # "from module import name as alias" has no clean library-level mapping
                        converted_lines.append(f"# {line} # Handle specific imports manually")
                    else:
                        # "import module as alias": map on the module name
                        module = line.split(' as ')[0].replace('import ', '').strip()
                        if module in self.import_mappings:
                            converted_lines.append(f"# {line}")
                            converted_lines.append(self.import_mappings[module])
                        else:
                            converted_lines.append(f"# {line} # No direct R equivalent")
                elif line.startswith('from '):
                    parts = line.split(' import ')
                    module = parts[0].replace('from ', '').strip()
                    if module in self.import_mappings:
                        converted_lines.append(f"# {line}")
                        converted_lines.append(self.import_mappings[module])
                    else:
                        converted_lines.append(f"# {line} # No direct R equivalent")
                else:
                    module = line.replace('import ', '').strip()
                    if module in self.import_mappings:
                        converted_lines.append(f"# {line}")
                        converted_lines.append(self.import_mappings[module])
                    else:
                        converted_lines.append(f"# {line} # No direct R equivalent")
            else:
                converted_lines.append(line)


        return '\n'.join(converted_lines)


    def convert_functions(self, code: str) -> str:
        """Convert Python function calls to R equivalents."""
        for py_func, r_func in self.function_mappings.items():
            code = code.replace(py_func, r_func)
        return code


    def apply_syntax_patterns(self, code: str) -> str:
        """Apply regex patterns to convert Python syntax to R syntax."""
        for pattern, replacement in self.syntax_patterns:
            code = re.sub(pattern, replacement, code)
        return code


    def convert_pandas_operations(self, code: str) -> str:
        """Convert common pandas operations to dplyr/tidyr equivalents."""
        code = re.sub(r'df\[[\'"](.*?)[\'"]\]', r'df$\1', code)
        code = re.sub(r'df\.(\w+)', r'df$\1', code)


        code = re.sub(r'df\[df\[[\'"](.*?)[\'"]\]\s*([><=!]+)\s*([^]]+)\]', r'df[df$\1 \2 \3, ]', code)


        return code


    def convert_plotting(self, code: str) -> str:
        """Convert matplotlib/seaborn plotting to ggplot2."""
        conversions = [
            (r'plt\.figure\(figsize=\((\d+),\s*(\d+)\)\)', r'# Set figure size in ggplot theme'),
            (r'plt\.title\([\'"](.*?)[\'\"]\)', r'+ ggtitle("\1")'),
            (r'plt\.xlabel\([\'"](.*?)[\'\"]\)', r'+ xlab("\1")'),
            (r'plt\.ylabel\([\'"](.*?)[\'\"]\)', r'+ ylab("\1")'),
            (r'plt\.legend\(\)', r'+ theme(legend.position="right")'),
            (r'plt\.grid\(True\)', r'+ theme(panel.grid.major = element_line())'),
        ]


        for pattern, replacement in conversions:
            code = re.sub(pattern, replacement, code)


        return code


    def add_r_context(self, code: str) -> str:
        """Add R-specific context and comments."""
        r_header=""'# R Statistical Analysis Code
# Converted from Python using Enhanced Converter with Gemini AI Validation
# Install required packages: install.packages(c("dplyr", "ggplot2", "tidyr", "readr"))


'''
        return r_header + code


    def convert_code(self, python_code: str) -> str:
        """Main conversion method that applies all transformations."""
        code = python_code.strip()


        code = self.convert_imports(code)
        code = self.convert_functions(code)
        code = self.convert_pandas_operations(code)
        code = self.convert_plotting(code)
        code = self.apply_syntax_patterns(code)
        code = self.add_r_context(code)


        return code


    def convert_and_validate(self, python_code: str, use_gemini: bool = True) -> Dict:
        """
        Convert Python code to R and validate with Gemini AI
        """
        r_code = self.convert_code(python_code)


        result = {
            "original_python": python_code,
            "converted_r": r_code,
            "validation": None
        }


        if use_gemini and self.validator.api_key:
            print("🔍 Validating conversion with Gemini AI...")
            validation = self.validator.validate_conversion(python_code, r_code)
            result["validation"] = validation


            if validation.get("improved_code") and validation.get("improved_code") != r_code:
                result["final_r_code"] = validation["improved_code"]
            else:
                result["final_r_code"] = r_code
        else:
            result["final_r_code"] = r_code
            if not self.validator.api_key:
                result["validation"] = {"note": "Set GEMINI_API_KEY for AI validation"}


        return result


    def print_results(self, results: Dict):
        """Pretty print the conversion results"""
        print("=" * 80)
        print("🐍 ORIGINAL PYTHON CODE")
        print("=" * 80)
        print(results["original_python"])


        print("\n" + "=" * 80)
        print("📊 CONVERTED R CODE")
        print("=" * 80)
        print(results["final_r_code"])


        if results.get("validation"):
            validation = results["validation"]
            print("\n" + "=" * 80)
            print("🤖 GEMINI AI VALIDATION")
            print("=" * 80)


            if validation.get("validation_score"):
                print(f"📈 Score: {validation['validation_score']}/100")


            if validation.get("summary"):
                print(f"📝 Summary: {validation['summary']}")


            if validation.get("issues_found"):
                print("\n⚠️  Issues Found:")
                for issue in validation["issues_found"]:
                    print(f"   • {issue}")


            if validation.get("suggestions"):
                print("\n💡 Suggestions:")
                for suggestion in validation["suggestions"]:
                    print(f"   • {suggestion}")


